
Example 1 with FileCache

Use of info.ephyra.util.FileCache in project lucida by claritylab, in the class ASSERT, method annotatePredicates.

/**
	 * Annotates the predicates in an array of sentences.
	 * 
	 * @param ss sentences to be parsed
	 * @return annotated sentences
	 */
public static String[][] annotatePredicates(String[] ss) {
    // drop special characters that ASSERT cannot handle: keep only runs of
    // word characters and common punctuation (the exact character class is
    // an assumption; the point is to strip markup and control characters)
    Pattern p = Pattern.compile("[\\w.,;:!?'()\\-]++");
    for (int i = 0; i < ss.length; i++) {
        StringBuilder noSpecChar = new StringBuilder();
        Matcher m = p.matcher(ss[i]);
        while (m.find()) noSpecChar.append(' ').append(m.group(0));
        ss[i] = noSpecChar.toString().trim();
    }
    // parses from both cache and ASSERT
    String[][] allParses = new String[ss.length][];
    // used to merge parses from cache and ASSERT
    ArrayList<Integer> originalIndices = new ArrayList<Integer>();
    // if caching is enabled, try to read parses from the cache
    if (CACHING) {
        FileCache cache = new FileCache(CACHE_DIR);
        // sentences that are not in the cache
        ArrayList<String> notInCache = new ArrayList<String>();
        for (int i = 0; i < ss.length; i++) {
            String[] parses = cache.read(ss[i]);
            if (parses != null)
                allParses[i] = parses;
            else {
                notInCache.add(ss[i]);
                originalIndices.add(i);
            }
        }
        ss = notInCache.toArray(new String[notInCache.size()]);
    }
    // get missing parses from ASSERT
    String[][] parses = new String[ss.length][];
    if (ss.length > 0 && ASSERT_DIR != null && ASSERT_DIR.length() > 0) {
        try {
            MsgPrinter.printStatusMsgTimestamp("Parsing " + ss.length + " sentences with ASSERT...");
            int beginIndex = 0;
            // ASSERT is restarted if it crashed; each iteration parses the
            // sentences that remain
            while (beginIndex < ss.length) {
                // copy sentences that have not been parsed yet
                String[] sentences = new String[ss.length - beginIndex];
                for (int i = 0; i < sentences.length; i++) sentences[i] = ss[i + beginIndex];
                // parse these sentences
                File input = createInputFile(sentences);
                File logf = execAssertProcess(input);
                String[][] output = readOutputFile(input, ss.length);
                // check the log file for a sentence that crashed ASSERT
                int lastIndex = checkLogFile(logf);
                if (lastIndex > -1 && lastIndex < Integer.MAX_VALUE) {
                    MsgPrinter.printErrorMsg("ASSERT could not parse sentence:\n" + sentences[lastIndex]);
                    output[lastIndex] = null;
                } else if (lastIndex == Integer.MAX_VALUE) {
                    lastIndex = sentences.length - 1;
                }
                lastIndex = beginIndex + lastIndex;
                // merge the parses into one array
                for (int i = beginIndex; i <= lastIndex; i++) parses[i] = output[i - beginIndex];
                beginIndex = lastIndex + 1;
            }
            MsgPrinter.printStatusMsgTimestamp("...done");
        } catch (Exception e) {
            MsgPrinter.printErrorMsg("\nCould not call ASSERT:\n" + e.getMessage());
            System.exit(1);
        }
    }
    // if caching is enabled, write new parses to cache and merge parses from cache and ASSERT
    if (CACHING) {
        FileCache cache = new FileCache(CACHE_DIR);
        for (int i = 0; i < parses.length; i++) {
            // write to cache
            if (parses[i] != null)
                cache.write(ss[i], parses[i]);
            // merge with results from cache
            allParses[originalIndices.get(i)] = parses[i];
        }
    } else {
        allParses = parses;
    }
    // return an empty array for sentences that could not be parsed
    for (int i = 0; i < allParses.length; i++)
        if (allParses[i] == null) allParses[i] = new String[0];
    return allParses;
}
Also used: Pattern (java.util.regex.Pattern), Matcher (java.util.regex.Matcher), ArrayList (java.util.ArrayList), IOException (java.io.IOException), FileCache (info.ephyra.util.FileCache), File (java.io.File)
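
Taken together, Example 1 is a standard read-through cache: look the key up, compute on a miss, write the result back. A minimal sketch of that pattern, assuming only the FileCache members visible in these examples (a FileCache(String directory) constructor, a String[] read(String key) that returns null on a miss, and void write(String key, String[] entries)); the class name, cache directory, and computeParses helper are hypothetical:

import info.ephyra.util.FileCache;

public class ReadThroughExample {

    // hypothetical cache directory (the examples above read it from CACHE_DIR)
    private static final String CACHE_DIR = "cache/assert";

    /** Returns the cached entries for a key, computing and caching them on a miss. */
    public static String[] getOrCompute(String key) {
        FileCache cache = new FileCache(CACHE_DIR);
        // read() returns null when the key has not been cached yet
        String[] entries = cache.read(key);
        if (entries == null) {
            entries = computeParses(key);
            if (entries != null)
                cache.write(key, entries);
        }
        return entries;
    }

    // hypothetical stand-in for the expensive step (ASSERT, a web fetch, ...)
    private static String[] computeParses(String key) {
        return new String[] { key };
    }
}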

Example 2 with FileCache

Use of info.ephyra.util.FileCache in project lucida by claritylab, in the class WebDocumentFetcher, method apply.

/**
	 * Fetches the top <code>MAX_DOCS</code> documents containing the given
	 * search engine snippets. The original snippets are dropped.
	 * 
	 * @param results array of <code>Result</code> objects containing snippets
	 * @return array of <code>Result</code> objects containing entire documents
	 */
public Result[] apply(Result[] results) {
    // documents containing the search engine snippets
    docs = new ArrayList<Result>();
    // start document fetchers
    HashSet<String> urls = new HashSet<String>();
    for (Result result : results) {
        // only apply this filter to results for the semantic parsing approach
        Query query = result.getQuery();
        Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
        if (!query.extractWith(FactoidsFromPredicatesFilter.ID) || ps.length == 0 || result.getScore() > Float.NEGATIVE_INFINITY)
            continue;
        // if result is not a web document then just make a copy
        if (!result.getDocID().contains(":")) {
            Result newResult = result.getCopy();
            newResult.setScore(0);
            docs.add(newResult);
            continue;
        }
        // fetch at most MAX_DOCS documents
        if (urls.size() >= MAX_DOCS)
            break;
        String url = result.getDocID();
        // no forbidden document type
        if (url.matches("(?i).*?" + FORBIDDEN_DOCS))
            continue;
        // only HTTP connections
        try {
            URLConnection conn = (new URL(url)).openConnection();
            if (!(conn instanceof HttpURLConnection))
                continue;
        } catch (IOException e) {
            continue;
        }
        // no duplicate document
        if (!urls.add(url))
            continue;
        // if caching is enabled, try to read document from cache
        if (CACHING) {
            FileCache cache = new FileCache(CACHE_DIR);
            String[] entries = cache.read(url);
            if (entries != null) {
                StringBuilder sb = new StringBuilder();
                for (String entry : entries) {
                    sb.append(entry);
                    sb.append("\n");
                }
                String docText = sb.toString();
                Result doc = new Result(docText, result.getQuery(), url, result.getHitPos());
                doc.setScore(0);
                docs.add(doc);
                continue;
            }
        }
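        // fetch the document in a separate worker thread; addDoc() collects the result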
        (new WebDocumentFetcher()).start(this, result);
    }
    // wait until all fetchers are done
    waitForDocs();
    // keep the old results and append the fetched documents
    Result[] newResults = docs.toArray(new Result[docs.size()]);
    Result[] allResults = new Result[results.length + newResults.length];
    for (int i = 0; i < results.length; i++) allResults[i] = results[i];
    for (int i = 0; i < newResults.length; i++) allResults[results.length + i] = newResults[i];
    return allResults;
}
Also used: Query (info.ephyra.querygeneration.Query), IOException (java.io.IOException), HttpURLConnection (java.net.HttpURLConnection), URLConnection (java.net.URLConnection), URL (java.net.URL), Result (info.ephyra.search.Result), Predicate (info.ephyra.nlp.semantics.Predicate), FileCache (info.ephyra.util.FileCache), HashSet (java.util.HashSet)
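
Note the cache-key convention in this example: the document URL is the key, and the document text is stored one line per cache entry, then re-joined on a hit. A hypothetical round trip under that convention, reusing the CACHE_DIR constant from the sketch above:

FileCache cache = new FileCache(CACHE_DIR);
String url = "http://example.com/page.html";            // hypothetical key
cache.write(url, new String[] { "line 1", "line 2" });  // one entry per line
String[] entries = cache.read(url);                     // {"line 1", "line 2"}
// apply() rebuilds the text by appending "\n" after every entry, so the
// reconstructed document always ends with a trailing newline.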

Example 3 with FileCache

Use of info.ephyra.util.FileCache in project lucida by claritylab, in the class WebDocumentFetcher, method addDoc.

/**
	 * Used by the <code>WebDocumentFetcher</code> threads to return the
	 * documents.
	 * 
	 * @param doc document that contains a snippet
	 * @param cached flag indicating that the document was fetched from the
	 *               search engine cache
	 */
public void addDoc(Result doc, boolean cached) {
    synchronized (docs) {
        if (doc != null) {
            docs.add(doc);
            // if caching is enabled and the document was not fetched from the
            // search engine cache, write the document to the local cache
            if (CACHING && !cached) {
                FileCache cache = new FileCache(CACHE_DIR);
                cache.write(doc.getDocID(), new String[] { doc.getAnswer() });
            }
        }
        pending--;
        // signal that the fetcher is done
        docs.notify();
    }
}
Also used: FileCache (info.ephyra.util.FileCache)
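
addDoc decrements a pending counter and calls docs.notify() while holding the docs lock; the matching waitForDocs() used in Example 2 is not shown in these snippets. A minimal sketch consistent with that protocol, assuming pending counts the fetcher threads started in apply():

/** Blocks until every started fetcher has reported back via addDoc(). */
private void waitForDocs() {
    synchronized (docs) {
        // the loop guards against spurious wake-ups
        while (pending > 0) {
            try {
                docs.wait();  // released by docs.notify() in addDoc()
            } catch (InterruptedException e) {
                // ignore and keep waiting, matching addDoc's simple protocol
            }
        }
    }
}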

Aggregations

FileCache (info.ephyra.util.FileCache): 3
IOException (java.io.IOException): 2
Predicate (info.ephyra.nlp.semantics.Predicate): 1
Query (info.ephyra.querygeneration.Query): 1
Result (info.ephyra.search.Result): 1
File (java.io.File): 1
HttpURLConnection (java.net.HttpURLConnection): 1
URL (java.net.URL): 1
URLConnection (java.net.URLConnection): 1
ArrayList (java.util.ArrayList): 1
HashSet (java.util.HashSet): 1
Matcher (java.util.regex.Matcher): 1
Pattern (java.util.regex.Pattern): 1