Question

Your task is to design and implement an algorithm that searches a collection of documents. You are free to select the data structures and algorithms that you consider most efficient for this task. Of course, you will have to justify your decisions.

First, you will process the documents and store their content (i.e., words) in the data structures of your choice.

Second, each input query is a single word. For each query, you will search for it in the documents, using the previously built data structures and an algorithm of your choice.

Third, for each query, you will display the documents that contain it, ordered by the number of times the query word occurs in each document; the document with the most occurrences is displayed first.

Example

Consider the following sample documents.

Doc1:

I like the class on data structures and algorithms. This is a useful class.

Doc2:

I hate the class on data structures and algorithms.

Doc3:

Interesting statistical data may result from this survey.

Sample query: class

For the sample query, the output should be:

Doc1:

I like the class on data structures and algorithms. This is a useful class.

Doc2:

I hate the class on data structures and algorithms.

Note that Doc1 comes before Doc2 because the query word "class" occurs twice in Doc1 but only once in Doc2.

Explanation / Answer
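
Before the crawler-based code below, here is the basic idea as a minimal sketch (class and method names in this sketch are illustrative and are not part of the code that follows): build an inverted index that maps each word to a per-document occurrence count, then answer a query by sorting the matching documents by that count in descending order.

import java.util.*;

// Minimal sketch of an inverted index: word -> (document id -> occurrence count).
public class SimpleInvertedIndex {

    private final Map<String, Map<String, Integer>> index = new HashMap<>();

    public void addDocument(final String docId, final String text) {
        for (final String word : text.toLowerCase().replaceAll("\\p{Punct}", "").split("\\s+")) {
            Map<String, Integer> counts = index.get(word);
            if (counts == null) {
                counts = new HashMap<>();
                index.put(word, counts);
            }
            final Integer current = counts.get(docId);
            counts.put(docId, current == null ? 1 : current + 1);
        }
    }

    // Returns document ids ordered by how often the query word occurs, most frequent first.
    public List<String> search(final String query) {
        final Map<String, Integer> counts = index.get(query.toLowerCase());
        if (counts == null) {
            return new ArrayList<>();
        }
        final List<String> docs = new ArrayList<>(counts.keySet());
        Collections.sort(docs, new Comparator<String>() {
            @Override
            public int compare(final String a, final String b) {
                return counts.get(b).compareTo(counts.get(a));
            }
        });
        return docs;
    }
}

For the sample documents, search("class") would return Doc1 before Doc2, matching the expected output. The code below takes a heavier-weight route: it crawls web pages, stores their words and links in an embedded Neo4j graph database, and scores results with several concurrent tasks (word frequency, document location, PageRank, and a neural network).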

package net.briandupreez.pci.chapter4;

import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.kernel.impl.util.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class CreateDBFactory {

    private static GraphDatabaseService graphDb = null;
    public static final String RESOURCES_CRAWL_DB = "resources/crawl/db";

    public static GraphDatabaseService createInMemoryDB() {
        if (null == graphDb) {
            synchronized (GraphDatabaseService.class) {
                if (null == graphDb) {
                    final Map<String, String> config = new HashMap<>();
                    config.put("neostore.nodestore.db.mapped_memory", "50M");
                    config.put("string_block_size", "60");
                    config.put("array_block_size", "300");
                    graphDb = new GraphDatabaseFactory()
                            .newEmbeddedDatabaseBuilder(RESOURCES_CRAWL_DB)
                            .setConfig(config)
                            .newGraphDatabase();
                    registerShutdownHook(graphDb);
                }
            }
        }
        return graphDb;
    }

    private static void registerShutdownHook(final GraphDatabaseService graphDb) {
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                graphDb.shutdown();
            }
        });
    }

    public static void clearDb() {
        try {
            if (graphDb != null) {
                graphDb.shutdown();
                graphDb = null;
            }
            FileUtils.deleteRecursively(new File(RESOURCES_CRAWL_DB));
        } catch (final IOException e) {
            throw new RuntimeException(e);
        }
    }
}

package net.briandupreez.pci.chapter4;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.index.Index;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class Neo4JWebCrawler extends WebCrawler {

    private final GraphDatabaseService graphDb;

    public Neo4JWebCrawler() {
        this.graphDb = CreateDBFactory.createInMemoryDB();
    }

    @Override
    public boolean shouldVisit(final WebURL url) {
        final String href = url.getURL().toLowerCase();
        return !NodeConstants.FILTERS.matcher(href).matches();
    }

    @Override
    public void visit(final Page page) {
        final String url = page.getWebURL().getURL();
        System.out.println("URL: " + url);
        final Index<Node> nodeIndex = graphDb.index().forNodes(NodeConstants.PAGE_INDEX);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            List<WebURL> links = htmlParseData.getOutgoingUrls();

            Transaction tx = graphDb.beginTx();
            try {
                // Create a node for the page and index it by URL.
                final Node pageNode = graphDb.createNode();
                pageNode.setProperty(NodeConstants.URL, url);
                nodeIndex.add(pageNode, NodeConstants.URL, url);

                // Create a node per word, keeping its position within the page.
                final List<String> words = cleanAndSplitString(text);
                int index = 0;
                for (final String word : words) {
                    final Node wordNode = graphDb.createNode();
                    wordNode.setProperty(NodeConstants.WORD, word);
                    wordNode.setProperty(NodeConstants.INDEX, index++);
                    final Relationship relationship =
                            pageNode.createRelationshipTo(wordNode, RelationshipTypes.CONTAINS);
                    relationship.setProperty(NodeConstants.SOURCE, url);
                }

                // Record outgoing links so link-based scoring (PageRank) can be computed later.
                for (final WebURL webURL : links) {
                    System.out.println("Linking to " + webURL);
                    final Node linkNode = graphDb.createNode();
                    linkNode.setProperty(NodeConstants.URL, webURL.getURL());
                    final Relationship relationship =
                            pageNode.createRelationshipTo(linkNode, RelationshipTypes.LINK_TO);
                    relationship.setProperty(NodeConstants.SOURCE, url);
                    relationship.setProperty(NodeConstants.DESTINATION, webURL.getURL());
                }

                tx.success();
            } finally {
                tx.finish();
            }
        }
    }

    private static List<String> cleanAndSplitString(final String input) {
        if (input != null) {
            // Lower-case, strip punctuation and digits, then split on whitespace.
            final String[] dic = input.toLowerCase()
                    .replaceAll("\\p{Punct}", "")
                    .replaceAll("\\p{Digit}", "")
                    .split("\\s+");
            return Arrays.asList(dic);
        }
        return new ArrayList<>();
    }
}
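
The crawler above refers to NodeConstants and RelationshipTypes, which are not shown in this answer. A minimal hypothetical version, inferred purely from how the crawler and the queries use them (the actual string values and the FILTERS pattern are assumptions), could look like this, each in its own file:

package net.briandupreez.pci.chapter4;

import java.util.regex.Pattern;

// Hypothetical sketch of the missing NodeConstants class; only the names come from
// the code above, the values (especially FILTERS and PAGE_INDEX) are assumptions.
public final class NodeConstants {

    public static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|ico|pdf|zip|gz))$");
    public static final String PAGE_INDEX = "pageIndex";
    public static final String URL = "url";
    public static final String WORD = "word";
    public static final String INDEX = "index";
    public static final String SOURCE = "source";
    public static final String DESTINATION = "destination";

    private NodeConstants() {
    }
}

package net.briandupreez.pci.chapter4;

import org.neo4j.graphdb.RelationshipType;

// Hypothetical sketch of the missing relationship types referenced by the crawler and queries.
public enum RelationshipTypes implements RelationshipType {
    CONTAINS, LINK_TO
}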

// Fragment: run the individual search/scoring tasks concurrently
// (assumes an enclosing method that declares throws InterruptedException).
final ExecutorService executorService = Executors.newFixedThreadPool(4);
final String[] searchTerms = {"java", "spring"};
List<Callable<TaskResponse>> tasks = new ArrayList<>();
tasks.add(new WordFrequencyTask(searchTerms));
tasks.add(new DocumentLocationTask(searchTerms));
tasks.add(new PageRankTask(searchTerms));
tasks.add(new NeuralNetworkTask(searchTerms));
final List<Future<TaskResponse>> results = executorService.invokeAll(tasks);
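
The fragment above only collects the Future objects. Reading the responses might look like the hedged sketch below; the taskClazz and resultMap field names come from PageRankTask further down, while the printing and shutdown are assumptions.

// Hedged sketch: collect each task's normalized scores.
for (final Future<TaskResponse> future : results) {
    final TaskResponse taskResponse = future.get(); // may throw InterruptedException / ExecutionException
    System.out.println(taskResponse.taskClazz.getSimpleName() + ": " + taskResponse.resultMap);
}
executorService.shutdown();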

public class PageRankTask extends SearchTask implements Callable<TaskResponse> {

    public PageRankTask(final String... terms) {
        super(terms);
    }

    @Override
    protected ExecutionResult executeQuery(final String... words) {
        final ExecutionEngine engine = new ExecutionEngine(graphDb);
        final StringBuilder bob = new StringBuilder("START page=node(*) MATCH (page)-[:CONTAINS]->words ");
        bob.append(", (page)-[:LINK_TO]->related ");
        bob.append("WHERE words.word in [");
        bob.append(formatArray(words));
        bob.append("] ");
        bob.append("RETURN DISTINCT page, related");
        return engine.execute(bob.toString());
    }

    public TaskResponse call() {
        final ExecutionResult result = executeQuery(searchTerms);
        final Map<String, Double> returnMap = convertToUrlTotalWords(result);
        final TaskResponse response = new TaskResponse();
        response.taskClazz = this.getClass();
        response.resultMap = NormalizationFunctions.normalizeMap(returnMap, true);
        return response;
    }

    private Map<String, Double> convertToUrlTotalWords(final ExecutionResult result) {
        final Map<String, Double> uniqueUrls = new HashMap<>();
        final Graph g = new SingleGraph("rank", false, true);
        final Iterator<Node> pageIterator = result.columnAs("related");
        while (pageIterator.hasNext()) {
            final Node node = pageIterator.next();
            final Iterator<Relationship> relationshipIterator = node.getRelationships().iterator();
            while (relationshipIterator.hasNext()) {
                final Relationship relationship = relationshipIterator.next();
                final String source = relationship.getProperty(NodeConstants.SOURCE).toString();
                uniqueUrls.put(source, 0.0);
                final String destination = relationship.getProperty(NodeConstants.DESTINATION).toString();
                g.addEdge(String.valueOf(node.getId()), source, destination, true);
            }
        }
        computeAndSetPageRankScores(uniqueUrls, g);
        return uniqueUrls;
    }

    private void computeAndSetPageRankScores(final Map<String, Double> uniqueUrls, final Graph graph) {
        final PageRank pr = new PageRank();
        pr.init(graph);
        pr.compute();

        for (final Map.Entry<String, Double> entry : uniqueUrls.entrySet()) {
            final double score = 100 * pr.getRank(graph.getNode(entry.getKey()));
            entry.setValue(score);
        }
    }
}
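
PageRankTask extends a SearchTask base class and calls formatArray, NormalizationFunctions, and TaskResponse, none of which appear in this answer. A minimal hypothetical base class, assuming only what the code above actually uses (including the import paths for Neo4j 1.9 Cypher), might look like this:

package net.briandupreez.pci.chapter4;

import org.neo4j.cypher.javacompat.ExecutionResult;
import org.neo4j.graphdb.GraphDatabaseService;

// Hypothetical sketch of the missing SearchTask base class; field and method names
// are inferred from how PageRankTask uses them and are assumptions, not the original code.
public abstract class SearchTask {

    protected final GraphDatabaseService graphDb;
    protected final String[] searchTerms;

    protected SearchTask(final String... terms) {
        this.searchTerms = terms;
        this.graphDb = CreateDBFactory.createInMemoryDB();
    }

    protected abstract ExecutionResult executeQuery(final String... words);

    // Formats {"java", "spring"} as 'java', 'spring' for use inside a Cypher IN clause.
    protected String formatArray(final String... words) {
        final StringBuilder builder = new StringBuilder();
        for (int i = 0; i < words.length; i++) {
            if (i > 0) {
                builder.append(", ");
            }
            builder.append("'").append(words[i]).append("'");
        }
        return builder.toString();
    }
}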

package net.briandupreez.pci.chapter4;

import java.util.*;

public class MapUtil {

    public static <K, V extends Comparable<? super V>> List<Map.Entry<K, V>> entriesSortedByValues(
            final Map<K, V> map, final boolean ascending) {

        final List<Map.Entry<K, V>> sortedEntries = new ArrayList<>(map.entrySet());
        Collections.sort(sortedEntries, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(final Map.Entry<K, V> e1, final Map.Entry<K, V> e2) {
                if (ascending) {
                    return e1.getValue().compareTo(e2.getValue());
                } else {
                    return e2.getValue().compareTo(e1.getValue());
                }
            }
        });
        return sortedEntries;
    }
}
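
MapUtil is what finally produces the ordering the question asks for: given a map from document (or URL) to score, sorting with ascending = false yields the highest-scoring document first. A small usage sketch (the document names and scores below are illustrative only):

// Hedged usage sketch: rank documents by descending score using MapUtil.
final Map<String, Double> scores = new HashMap<>();
scores.put("Doc1", 2.0);
scores.put("Doc2", 1.0);

for (final Map.Entry<String, Double> entry : MapUtil.entriesSortedByValues(scores, false)) {
    System.out.println(entry.getKey() + " -> " + entry.getValue());
}
// Prints Doc1 before Doc2, matching the ordering required by the question.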

<dependency>
  <groupId>com.google.guava</groupId>
  <artifactId>guava</artifactId>
  <version>14.0.1</version>
</dependency>
<dependency>
  <groupId>org.encog</groupId>
  <artifactId>encog-core</artifactId>
  <version>3.2.0-SNAPSHOT</version>
</dependency>
<dependency>
  <groupId>edu.uci.ics</groupId>
  <artifactId>crawler4j</artifactId>
  <version>3.5</version>
  <type>jar</type>
  <scope>compile</scope>
</dependency>
<dependency>
  <groupId>org.neo4j</groupId>
  <artifactId>neo4j</artifactId>
  <version>1.9</version>
</dependency>
<dependency>
  <groupId>org.graphstream</groupId>
  <artifactId>gs-algo</artifactId>
  <version>1.1.2</version>
</dependency>