Use of info.ephyra.util.FileCache in project lucida by claritylab.
Class ASSERT, method annotatePredicates.
/**
* Annotates the predicates in an array of sentences.
*
* @param ss sentences to be parsed
* @return annotated sentences
*/
public static String[][] annotatePredicates(String[] ss) {
    // drop special characters that ASSERT cannot handle by keeping only
    // runs of word characters and common punctuation (the exact original
    // pattern is uncertain; this character class is an assumption)
    Pattern p = Pattern.compile("[\\w.,;:!?'\"-]++");
    for (int i = 0; i < ss.length; i++) {
        String noSpecChar = "";
        Matcher m = p.matcher(ss[i]);
        while (m.find()) noSpecChar += " " + m.group(0);
        ss[i] = noSpecChar.trim();
    }
    // parses from both cache and ASSERT
    String[][] allParses = new String[ss.length][];
    // original indices, used to merge parses from cache and ASSERT
    ArrayList<Integer> originalIndices = new ArrayList<Integer>();
    // if caching is enabled, try to read parses from the cache
    if (CACHING) {
        FileCache cache = new FileCache(CACHE_DIR);
        // sentences that are not in the cache
        ArrayList<String> notInCache = new ArrayList<String>();
        for (int i = 0; i < ss.length; i++) {
            String[] parses = cache.read(ss[i]);
            if (parses != null)
                allParses[i] = parses;
            else {
                notInCache.add(ss[i]);
                originalIndices.add(i);
            }
        }
        ss = notInCache.toArray(new String[notInCache.size()]);
    }
    // get missing parses from ASSERT
    String[][] parses = new String[ss.length][];
    if (ss.length > 0 && ASSERT_DIR != null && ASSERT_DIR.length() > 0) {
        try {
            MsgPrinter.printStatusMsgTimestamp("Parsing " + ss.length +
                    " sentences with ASSERT...");
            int beginIndex = 0;
            while (beginIndex < ss.length) {
                // restart ASSERT if it crashed:
                // copy the sentences that have not been parsed yet
                String[] sentences = new String[ss.length - beginIndex];
                for (int i = 0; i < sentences.length; i++)
                    sentences[i] = ss[i + beginIndex];
                // parse these sentences
                File input = createInputFile(sentences);
                File logf = execAssertProcess(input);
                String[][] output = readOutputFile(input, ss.length);
                // check the log file for a sentence that made ASSERT fail
                int lastIndex = checkLogFile(logf);
                if (lastIndex > -1 && lastIndex < Integer.MAX_VALUE) {
                    MsgPrinter.printErrorMsg("ASSERT could not parse sentence:\n"
                            + sentences[lastIndex]);
                    output[lastIndex] = null;
                } else if (lastIndex == Integer.MAX_VALUE) {
                    // all remaining sentences were parsed
                    lastIndex = sentences.length - 1;
                }
                // merge the parses into one array
                lastIndex = beginIndex + lastIndex;
                for (int i = beginIndex; i <= lastIndex; i++)
                    parses[i] = output[i - beginIndex];
                beginIndex = lastIndex + 1;
            }
            MsgPrinter.printStatusMsgTimestamp("...done");
        } catch (Exception e) {
            MsgPrinter.printErrorMsg("\nCould not call ASSERT:\n" + e.getMessage());
            System.exit(1);
        }
    }
    // if caching is enabled, write the new parses to the cache
    // and merge them with the parses from the cache
    if (CACHING) {
        FileCache cache = new FileCache(CACHE_DIR);
        for (int i = 0; i < parses.length; i++) {
            // write to cache
            if (parses[i] != null)
                cache.write(ss[i], parses[i]);
            // merge with the results from the cache
            allParses[originalIndices.get(i)] = parses[i];
        }
    } else {
        allParses = parses;
    }
    // return an empty array for sentences that could not be parsed
    for (int i = 0; i < allParses.length; i++)
        if (allParses[i] == null)
            allParses[i] = new String[0];
    return allParses;
}
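Both the read path (cache.read returning null on a miss) and the write path (cache.write storing the parses) treat FileCache as a plain key-to-string-array store on disk. Below is a minimal sketch of that assumed contract, not the real info.ephyra.util.FileCache; the class name SimpleFileCache and the hash-based file naming are illustrative.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

// Assumed stand-in for info.ephyra.util.FileCache: a directory of small
// files, one per key, each holding an array of strings as lines.
public class SimpleFileCache {

    private final Path dir;

    public SimpleFileCache(String cacheDir) {
        this.dir = Paths.get(cacheDir);
    }

    // Returns the cached lines for the key, or null on a cache miss.
    public String[] read(String key) {
        Path file = dir.resolve(fileNameFor(key));
        if (!Files.exists(file)) return null;
        try {
            List<String> lines = Files.readAllLines(file, StandardCharsets.UTF_8);
            return lines.toArray(new String[lines.size()]);
        } catch (IOException e) {
            return null;  // treat an unreadable entry as a miss
        }
    }

    // Stores the lines under the key, creating the directory if needed.
    public void write(String key, String[] lines) {
        try {
            Files.createDirectories(dir);
            Files.write(dir.resolve(fileNameFor(key)),
                    String.join("\n", lines).getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            // a failed write only costs a future cache miss
        }
    }

    // Hypothetical naming scheme: derive a file name from the key's hash.
    private static String fileNameFor(String key) {
        return Integer.toHexString(key.hashCode()) + ".cache";
    }
}

With a contract like this, annotatePredicates only pays the cost of launching ASSERT for sentences it has never parsed before.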
Use of info.ephyra.util.FileCache in project lucida by claritylab.
Class WebDocumentFetcherFilter, method apply (the method starts WebDocumentFetcher threads and collects their results).
/**
* Fetches the top <code>MAX_DOCS</code> documents containing the given
 * search engine snippets. The original snippet results are kept and the
 * fetched documents are appended.
*
* @param results array of <code>Result</code> objects containing snippets
* @return array of <code>Result</code> objects containing entire documents
*/
public Result[] apply(Result[] results) {
    // documents containing the search engine snippets
    docs = new ArrayList<Result>();
    // start document fetchers
    HashSet<String> urls = new HashSet<String>();
    for (Result result : results) {
        // only apply this filter to results for the semantic parsing approach
        Query query = result.getQuery();
        Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
        if (!query.extractWith(FactoidsFromPredicatesFilter.ID) ||
                ps.length == 0 ||
                result.getScore() > Float.NEGATIVE_INFINITY)
            continue;
        // if the result is not a web document, just make a copy
        if (!result.getDocID().contains(":")) {
            Result newResult = result.getCopy();
            newResult.setScore(0);
            docs.add(newResult);
            continue;
        }
        // fetch at most MAX_DOCS documents
        if (urls.size() >= MAX_DOCS)
            break;
        String url = result.getDocID();
        // no forbidden document types
        if (url.matches("(?i).*?" + FORBIDDEN_DOCS))
            continue;
        // only HTTP connections
        try {
            URLConnection conn = (new URL(url)).openConnection();
            if (!(conn instanceof HttpURLConnection))
                continue;
        } catch (IOException e) {
            continue;
        }
        // no duplicate documents
        if (!urls.add(url))
            continue;
        // if caching is enabled, try to read the document from the cache
        if (CACHING) {
            FileCache cache = new FileCache(CACHE_DIR);
            String[] entries = cache.read(url);
            if (entries != null) {
                StringBuilder sb = new StringBuilder();
                for (String entry : entries) {
                    sb.append(entry);
                    sb.append("\n");
                }
                String docText = sb.toString();
                Result doc = new Result(docText, result.getQuery(), url,
                        result.getHitPos());
                doc.setScore(0);
                docs.add(doc);
                continue;
            }
        }
        // fetch the document in a separate thread
        (new WebDocumentFetcher()).start(this, result);
    }
    // wait until all fetchers are done
    waitForDocs();
    // keep the old results and append the fetched documents
    Result[] newResults = docs.toArray(new Result[docs.size()]);
    Result[] allResults = new Result[results.length + newResults.length];
    for (int i = 0; i < results.length; i++)
        allResults[i] = results[i];
    for (int i = 0; i < newResults.length; i++)
        allResults[results.length + i] = newResults[i];
    return allResults;
}
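In a pipeline, apply() is called like any other filter: it blocks until every fetcher thread has reported back through addDoc() and then returns the original results plus one Result per fetched document. A hedged sketch of a call site follows; the enclosing class name DocumentFetchStep is hypothetical, and only the Ephyra types shown above are assumed.

// Illustrative call site (assumes the Ephyra classes shown above are on
// the classpath; the pipeline class name is hypothetical).
public class DocumentFetchStep {

    // Turns snippet results into full-document results where possible.
    public static Result[] fetchDocuments(Result[] snippetResults) {
        WebDocumentFetcherFilter filter = new WebDocumentFetcherFilter();
        // apply() blocks in waitForDocs() until every fetcher thread has
        // reported back, then returns the old results plus the documents
        return filter.apply(snippetResults);
    }
}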
Use of info.ephyra.util.FileCache in project lucida by claritylab.
Class WebDocumentFetcherFilter, method addDoc.
/**
* Used by the <code>WebDocumentFetcher</code> threads to return the
* documents.
*
* @param doc document that contains a snippet
* @param cached flag indicating that the document was fetched from the
* search engine cache
*/
public void addDoc(Result doc, boolean cached) {
    synchronized (docs) {
        if (doc != null) {
            docs.add(doc);
            // if caching is enabled and the document was not fetched from
            // the search engine cache, write it to the local cache
            if (CACHING && !cached) {
                FileCache cache = new FileCache(CACHE_DIR);
                cache.write(doc.getDocID(), new String[] { doc.getAnswer() });
            }
        }
        pending--;
        // signal that this fetcher is done
        docs.notify();
    }
}
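addDoc() is one half of a simple monitor: each fetcher thread decrements pending and calls notify() on docs, so the waitForDocs() used by apply() presumably blocks on the same lock until pending reaches zero. A plausible sketch of that counterpart is below; the field names pending and docs come from the code above, but the method body itself is an assumption.

// Plausible counterpart to addDoc(): wait on the shared monitor until
// every fetcher thread has reported back (assumed implementation).
private void waitForDocs() {
    synchronized (docs) {
        while (pending > 0) {
            try {
                // woken by docs.notify() in addDoc(); the loop guards
                // against spurious wakeups
                docs.wait();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }
    }
}

Holding the docs lock for both the counter update and the list append keeps the pair consistent, so apply() can never see pending == 0 while a fetched document is still missing from docs.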