package index; import java.io.IOException; import java.io.Reader; import java.ut

ID: 3710518 • Letter: P

Question

package index;

import java.io.IOException;

import java.io.Reader;

import java.util.List;

import java.util.Set;

import comparators.TfIdfComparator;

import documents.DocumentId;

/**

* A simplified document indexer and search engine.

* Documents are added to the engine one-by-one, and uniquely identified by a DocumentId.

* Documents are internally represented as "terms", which are lowercased versions of each word

* in the document.

* Queries for terms are also made on the lowercased version of the term. Terms are

* therefore case-insensitive.

* Lookups for documents can be done by term, and the most relevant document(s) to a specific term

* (as computed by tf-idf) can also be retrieved.

* See:

* - <https://en.wikipedia.org/wiki/Inverted_index>

* - <https://en.wikipedia.org/wiki/Search_engine_(computing)>

* - <https://en.wikipedia.org/wiki/Tf%E2%80%93idf>

* @author Marc Liberatore

public class SearchEngine {

/**

* Inserts a document into the search engine for later analysis and retrieval.

* The document is uniquely identified by a documentId; attempts to re-insert the same

* document are ignored.

* The document is supplied as a Reader; this method stores the document contents for

* later analysis and retrieval.

* @param documentId

* @param reader

* @throws IOException iff the reader throws an exception

public void addDocument(DocumentId documentId, Reader reader) throws IOException {

}

/**

* Returns the set of DocumentIds contained within the search engine that contain a given term.

* @param term

* @return the set of DocumentIds that contain a given term

public Set<DocumentId> indexLookup(String term) {

return null;

}

/**

* Returns the term frequency of a term in a particular document.

* The term frequency is number of times the term appears in a document.

* See

* @param documentId

* @param term

* @return the term frequency of a term in a particular document

* @throws IllegalArgumentException if the documentId has not been added to the engine

public int termFrequency(DocumentId documentId, String term) throws IllegalArgumentException {

return 0;

}

/**

* Returns the inverse document frequency of a term across all documents in the index.

* For our purposes, IDF is defined as log ((1 + N) / (1 + M)) where

* N is the number of documents in total, and M

* is the number of documents where the term appears.

* @param term

* @return the inverse document frequency of term

public double inverseDocumentFrequency(String term) {

// first calculate N, the number of documents plus one

// loop through all of the documents to calculate M

// finally, calculate and return log ((1 + N) / (1 + M))

// use Math.log to compute the logarithm

return 0;

}

/**

* Returns the tfidf score of a particular term for a particular document.

* tfidf is the product of term frequency and inverse document frequency for the given term and document.

* @param documentId

* @param term

* @return the tfidf of the the term/document

* @throws IllegalArgumentException if the documentId has not been added to the engine

public double tfIdf(DocumentId documentId, String term) throws IllegalArgumentException {

return 0.0;

}

/**

* Returns a sorted list of documents, most relevant to least relevant, for the given term.

* A document with a larger tfidf score is more relevant than a document with a lower tfidf score.

* Each document in the returned list must contain the term.

* @param term

* @param max the maximum number of documents to return (you may return fewer)

* @return a list of documents sorted in descending order by tfidf

public List<DocumentId> relevanceLookup(String term, int max) {

return null;

}

Explanation / Answer

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import comparators.TfIdfComparator;
import documents.DocumentId;

/**
* A simplified document indexer and search engine.
*
* Documents are added to the engine one-by-one, and uniquely identified by a DocumentId.
*
* Documents are internally represented as "terms", which are lowercased versions of each word
* in the document.
*
* Queries for terms are also made on the lowercased version of the term. Terms are
* therefore case-insensitive.
*
* Lookups for documents can be done by term, and the most relevant document(s) to a specific term
* (as computed by tf-idf) can also be retrieved.
*
*
*/
public class SearchEngine {

    public Map<String, DocumentId> google = new HashMap<String, DocumentId>();
    /**
     * Inserts a document into the search engine for later analysis and retrieval.
     *
     * The document is uniquely identified by a documentId; attempts to re-insert the same
     * document are ignored.
     *
     * The document is supplied as a Reader; this method stores the document contents for
     * later analysis and retrieval.
     *
     * @param documentId
     * @param reader
     * @throws IOException iff the reader throws an exception
     */

public void addDocument(DocumentId documentId, Reader reader) throws IOException {

String Document = "";

        int x = reader.read();
        while(x != -1){
            Document += (char)x;
            x = reader.read();
        }

Document = Document.toLowerCase();

        DocumentId p = google.get(documentId);
        if(p ==null){
            google.put(Document, documentId);
        }

}

    /**
     * Returns the set of DocumentIds contained within the search engine that contain a given term.
     *
     * @param term
     * @return the set of DocumentIds that contain a given term
     */
    public Set<DocumentId> indexLookup(String term) {

Set<DocumentId> t = new HashSet<DocumentId>();
String k = term.toLowerCase();

        for(String doc: google.keySet()){
            if(doc.contains(k)){
                t.add(google.get(doc));
            }
        }

return t;
}

    /**
     * Returns the term frequency of a term in a particular document.
     *
     * The term frequency is number of times the term appears in a document.
     *
     * See
     * @param documentId
     * @param term
     * @return the term frequency of a term in a particular document
     * @throws IllegalArgumentException if the documentId has not been added to the engine
     */
    public int termFrequency(DocumentId documentId, String term) throws IllegalArgumentException {
        if(!google.containsValue(documentId)){
            throw new IllegalArgumentException();
        }

term.toLowerCase();
String Document = "";

        for(String i: google.keySet()){
            if(google.get(i).equals(documentId)){
                Document = i;
                break;
            }
        }

int count = 0;
int i = Document.indexOf(term);

        while(i !=-1){
            count++;
            Document = Document.substring(i+term.length());
            i = Document.indexOf(term);
        }

return count;

       for(i = Document.indexOf(term); i<Integer.MAX_VALUE; i++){
           if(i!=-1){
               count++;
               Document = Document.substring(i + 1);
               i = Document.indexOf(term);
           }

           else{
           break;
           }
       }

}

    /**
     * Returns the inverse document frequency of a term across all documents in the index.
     *
     * For our purposes, IDF is defined as log ((1 + N) / (1 + M)) where
     * N is the number of documents in total, and M
     * is the number of documents where the term appears.
     *
     * @param term
     * @return the inverse document frequency of term
     */
    public double inverseDocumentFrequency(String term) {

double googlesize = google.size(), mTotal = 0;
term.toLowerCase();

        for(String Document: google.keySet()){
            if(Document.indexOf(term) != -1){
                mTotal++;
            }
        }

        double value = (double)Math.log((double)(1+googlesize) / (double)(1+ mTotal));
        return (double)value;
    }

    /**
     * Returns the tfidf score of a particular term for a particular document.
     *
     * tfidf is the product of term frequency and inverse document frequency for the given term and document.
     *
     * @param documentId
     * @param term
     * @return the tfidf of the the term/document
     * @throws IllegalArgumentException if the documentId has not been added to the engine
     */
    public double tfIdf(DocumentId documentId, String term) throws IllegalArgumentException {

        if(!google.containsValue(documentId)){
            throw new IllegalArgumentException();
        }

return termFrequency(documentId, term) * inverseDocumentFrequency(term);
}

    /**
     * Returns a sorted list of documents, most relevant to least relevant, for the given term.
     *
     * A document with a larger tfidf score is more relevant than a document with a lower tfidf score.
     *
     * Each document in the returned list must contain the term.
     *
     * @param term
     * @return a list of documents sorted in descending order by tfidf
     */
    public List<DocumentId> relevanceLookup(String term) {
        List<DocumentId> Documents = new ArrayList<DocumentId>();
        Set<DocumentId> dc = indexLookup(term);
        TfIdfComparator comparekek = new TfIdfComparator(this, term);

        for(DocumentId h: dc){
            Documents.add(h);
            for(int i = 0; i< Documents.size(); i++){
               if(comparekek.compare(h, Documents.get(i)) <= 0){
                   Documents.add(i, h);
                   break;
               }

               if(i == Documents.size()-1)
                   Documents.add(h);
           if(Documents.size() == 0){
               Documents.add(h);
           }
       }
        }

Documents.sort(comparekek);

return Documents;
}

}

Navigate

package hw2; * Homework assignment 2: Expression trees * * The goal of this assi

package insy.java.hw6; import javax.swing.*; /** A simple class that stores info

Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.

package index; import java.io.IOException; import java.io.Reader; import java.ut

Question

Explanation / Answer

Related Questions

Navigate