package index; import java.io.IOException; import java.io.Reader; import java.ut
ID: 3710518 • Letter: P
Question
package index;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.Set;
import comparators.TfIdfComparator;
import documents.DocumentId;
/**
* A simplified document indexer and search engine.
*
* Documents are added to the engine one-by-one, and uniquely identified by a DocumentId.
*
* Documents are internally represented as "terms", which are lowercased versions of each word
* in the document.
*
* Queries for terms are also made on the lowercased version of the term. Terms are
* therefore case-insensitive.
*
* Lookups for documents can be done by term, and the most relevant document(s) to a specific term
* (as computed by tf-idf) can also be retrieved.
*
* See:
* - <https://en.wikipedia.org/wiki/Inverted_index>
* - <https://en.wikipedia.org/wiki/Search_engine_(computing)>
* - <https://en.wikipedia.org/wiki/Tf%E2%80%93idf>
*
* @author Marc Liberatore
*
*/
public class SearchEngine {
/**
* Inserts a document into the search engine for later analysis and retrieval.
*
* The document is uniquely identified by a documentId; attempts to re-insert the same
* document are ignored.
*
* The document is supplied as a Reader; this method stores the document contents for
* later analysis and retrieval.
*
* @param documentId
* @param reader
* @throws IOException iff the reader throws an exception
*/
public void addDocument(DocumentId documentId, Reader reader) throws IOException {
}
/**
* Returns the set of DocumentIds contained within the search engine that contain a given term.
*
* @param term
* @return the set of DocumentIds that contain a given term
*/
public Set<DocumentId> indexLookup(String term) {
return null;
}
/**
* Returns the term frequency of a term in a particular document.
*
* The term frequency is number of times the term appears in a document.
*
* See
* @param documentId
* @param term
* @return the term frequency of a term in a particular document
* @throws IllegalArgumentException if the documentId has not been added to the engine
*/
public int termFrequency(DocumentId documentId, String term) throws IllegalArgumentException {
return 0;
}
/**
* Returns the inverse document frequency of a term across all documents in the index.
*
* For our purposes, IDF is defined as log ((1 + N) / (1 + M)) where
* N is the number of documents in total, and M
* is the number of documents where the term appears.
*
* @param term
* @return the inverse document frequency of term
*/
public double inverseDocumentFrequency(String term) {
// first calculate N, the number of documents plus one
// loop through all of the documents to calculate M
// finally, calculate and return log ((1 + N) / (1 + M))
// use Math.log to compute the logarithm
return 0;
}
/**
* Returns the tfidf score of a particular term for a particular document.
*
* tfidf is the product of term frequency and inverse document frequency for the given term and document.
*
* @param documentId
* @param term
* @return the tfidf of the the term/document
* @throws IllegalArgumentException if the documentId has not been added to the engine
*/
public double tfIdf(DocumentId documentId, String term) throws IllegalArgumentException {
return 0.0;
}
/**
* Returns a sorted list of documents, most relevant to least relevant, for the given term.
*
* A document with a larger tfidf score is more relevant than a document with a lower tfidf score.
*
* Each document in the returned list must contain the term.
*
* @param term
* @param max the maximum number of documents to return (you may return fewer)
* @return a list of documents sorted in descending order by tfidf
*/
public List<DocumentId> relevanceLookup(String term, int max) {
return null;
}
}
Explanation / Answer
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import comparators.TfIdfComparator;
import documents.DocumentId;
/**
* A simplified document indexer and search engine.
*
* Documents are added to the engine one-by-one, and uniquely identified by a DocumentId.
*
* Documents are internally represented as "terms", which are lowercased versions of each word
* in the document.
*
* Queries for terms are also made on the lowercased version of the term. Terms are
* therefore case-insensitive.
*
* Lookups for documents can be done by term, and the most relevant document(s) to a specific term
* (as computed by tf-idf) can also be retrieved.
*
*
*/
public class SearchEngine {
public Map<String, DocumentId> google = new HashMap<String, DocumentId>();
/**
* Inserts a document into the search engine for later analysis and retrieval.
*
* The document is uniquely identified by a documentId; attempts to re-insert the same
* document are ignored.
*
* The document is supplied as a Reader; this method stores the document contents for
* later analysis and retrieval.
*
* @param documentId
* @param reader
* @throws IOException iff the reader throws an exception
*/
public void addDocument(DocumentId documentId, Reader reader) throws IOException {
String Document = "";
int x = reader.read();
while(x != -1){
Document += (char)x;
x = reader.read();
}
Document = Document.toLowerCase();
DocumentId p = google.get(documentId);
if(p ==null){
google.put(Document, documentId);
}
}
/**
* Returns the set of DocumentIds contained within the search engine that contain a given term.
*
* @param term
* @return the set of DocumentIds that contain a given term
*/
public Set<DocumentId> indexLookup(String term) {
Set<DocumentId> t = new HashSet<DocumentId>();
String k = term.toLowerCase();
for(String doc: google.keySet()){
if(doc.contains(k)){
t.add(google.get(doc));
}
}
return t;
}
/**
* Returns the term frequency of a term in a particular document.
*
* The term frequency is number of times the term appears in a document.
*
* See
* @param documentId
* @param term
* @return the term frequency of a term in a particular document
* @throws IllegalArgumentException if the documentId has not been added to the engine
*/
public int termFrequency(DocumentId documentId, String term) throws IllegalArgumentException {
if(!google.containsValue(documentId)){
throw new IllegalArgumentException();
}
term.toLowerCase();
String Document = "";
for(String i: google.keySet()){
if(google.get(i).equals(documentId)){
Document = i;
break;
}
}
int count = 0;
int i = Document.indexOf(term);
while(i !=-1){
count++;
Document = Document.substring(i+term.length());
i = Document.indexOf(term);
}
return count;
for(i = Document.indexOf(term); i<Integer.MAX_VALUE; i++){
if(i!=-1){
count++;
Document = Document.substring(i + 1);
i = Document.indexOf(term);
}
else{
break;
}
}
}
/**
* Returns the inverse document frequency of a term across all documents in the index.
*
* For our purposes, IDF is defined as log ((1 + N) / (1 + M)) where
* N is the number of documents in total, and M
* is the number of documents where the term appears.
*
* @param term
* @return the inverse document frequency of term
*/
public double inverseDocumentFrequency(String term) {
double googlesize = google.size(), mTotal = 0;
term.toLowerCase();
for(String Document: google.keySet()){
if(Document.indexOf(term) != -1){
mTotal++;
}
}
double value = (double)Math.log((double)(1+googlesize) / (double)(1+ mTotal));
return (double)value;
}
/**
* Returns the tfidf score of a particular term for a particular document.
*
* tfidf is the product of term frequency and inverse document frequency for the given term and document.
*
* @param documentId
* @param term
* @return the tfidf of the the term/document
* @throws IllegalArgumentException if the documentId has not been added to the engine
*/
public double tfIdf(DocumentId documentId, String term) throws IllegalArgumentException {
if(!google.containsValue(documentId)){
throw new IllegalArgumentException();
}
return termFrequency(documentId, term) * inverseDocumentFrequency(term);
}
/**
* Returns a sorted list of documents, most relevant to least relevant, for the given term.
*
* A document with a larger tfidf score is more relevant than a document with a lower tfidf score.
*
* Each document in the returned list must contain the term.
*
* @param term
* @return a list of documents sorted in descending order by tfidf
*/
public List<DocumentId> relevanceLookup(String term) {
List<DocumentId> Documents = new ArrayList<DocumentId>();
Set<DocumentId> dc = indexLookup(term);
TfIdfComparator comparekek = new TfIdfComparator(this, term);
for(DocumentId h: dc){
Documents.add(h);
for(int i = 0; i< Documents.size(); i++){
if(comparekek.compare(h, Documents.get(i)) <= 0){
Documents.add(i, h);
break;
}
if(i == Documents.size()-1)
Documents.add(h);
if(Documents.size() == 0){
Documents.add(h);
}
}
}
Documents.sort(comparekek);
return Documents;
}
}
Related Questions
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.