
Example 6 with Query

Use of info.ephyra.querygeneration.Query in project lucida by claritylab.

The class WebTermImportanceFilter, method main:

public static void main(String[] args) {
    TEST_TARGET_GENERATION = true;
    MsgPrinter.enableStatusMsgs(true);
    MsgPrinter.enableErrorMsgs(true);
    // create tokenizer
    MsgPrinter.printStatusMsg("Creating tokenizer...");
    if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
        MsgPrinter.printErrorMsg("Could not create tokenizer.");
    //		LingPipe.createTokenizer();
    // create sentence detector
    //		MsgPrinter.printStatusMsg("Creating sentence detector...");
    //		if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
    //			MsgPrinter.printErrorMsg("Could not create sentence detector.");
    //		LingPipe.createSentenceDetector();
    // create stemmer
    MsgPrinter.printStatusMsg("Creating stemmer...");
    SnowballStemmer.create();
    // create part of speech tagger
    MsgPrinter.printStatusMsg("Creating POS tagger...");
    if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz", "res/nlp/postagger/opennlp/tagdict"))
        MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
    //		if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
    //				"train-wsj-0-18.holder"))
    //			MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");
    // create chunker
    MsgPrinter.printStatusMsg("Creating chunker...");
    if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/EnglishChunk.bin.gz"))
        MsgPrinter.printErrorMsg("Could not create chunker.");
    // create named entity taggers
    MsgPrinter.printStatusMsg("Creating NE taggers...");
    NETagger.loadListTaggers("res/nlp/netagger/lists/");
    NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
    MsgPrinter.printStatusMsg("  ...loading models");
    //			MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
    if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
        MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
    MsgPrinter.printStatusMsg("  ...done");
    WebTermImportanceFilter wtif = new TargetGeneratorTest(NO_NORMALIZATION);
    TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);
    for (TRECTarget target : targets) {
        String question = target.getTargetDesc();
        // query generation
        MsgPrinter.printGeneratingQueries();
        String qn = QuestionNormalizer.normalize(question);
        // print normalized question string
        MsgPrinter.printNormalization(qn);
        // log normalized question string
        Logger.logNormalization(qn);
        String[] kws = KeywordExtractor.getKeywords(qn);
        AnalyzedQuestion aq = new AnalyzedQuestion(question);
        aq.setKeywords(kws);
        aq.setFactoid(false);
        Query[] queries = new BagOfWordsG().generateQueries(aq);
        for (int q = 0; q < queries.length; q++) queries[q].setOriginalQueryString(question);
        Result[] results = new Result[1];
        results[0] = new Result("This would be the answer", queries[0]);
        wtif.apply(results);
    }
}
Also used : Query(info.ephyra.querygeneration.Query) BagOfWordsG(info.ephyra.querygeneration.generators.BagOfWordsG) Result(info.ephyra.search.Result) TRECTarget(info.ephyra.trec.TRECTarget) AnalyzedQuestion(info.ephyra.questionanalysis.AnalyzedQuestion)

Example 7 with Query

Use of info.ephyra.querygeneration.Query in project lucida by claritylab.

The class WebDocumentFetcher, method apply:

/**
	 * Fetches the top <code>MAX_DOCS</code> documents containing the given
	 * search engine snippets. The original snippets are dropped.
	 * 
	 * @param results array of <code>Result</code> objects containing snippets
	 * @return array of <code>Result</code> objects containing entire documents
	 */
public Result[] apply(Result[] results) {
    // documents containing the search engine snippets
    docs = new ArrayList<Result>();
    // start document fetchers
    HashSet<String> urls = new HashSet<String>();
    for (Result result : results) {
        // only apply this filter to results for the semantic parsing
        // approach
        Query query = result.getQuery();
        Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
        if (!query.extractWith(FactoidsFromPredicatesFilter.ID) || ps.length == 0 || result.getScore() > Float.NEGATIVE_INFINITY)
            continue;
        // if result is not a web document then just make a copy
        if (!result.getDocID().contains(":")) {
            Result newResult = result.getCopy();
            newResult.setScore(0);
            docs.add(newResult);
            continue;
        }
        // fetch at most MAX_DOCS documents
        if (urls.size() >= MAX_DOCS)
            break;
        String url = result.getDocID();
        // no forbidden document type
        if (url.matches("(?i).*?" + FORBIDDEN_DOCS))
            continue;
        // only HTTP connections
        try {
            URLConnection conn = (new URL(url)).openConnection();
            if (!(conn instanceof HttpURLConnection))
                continue;
        } catch (IOException e) {
            continue;
        }
        // no duplicate document
        if (!urls.add(url))
            continue;
        // if caching is enabled, try to read document from cache
        if (CACHING) {
            FileCache cache = new FileCache(CACHE_DIR);
            String[] entries = cache.read(url);
            if (entries != null) {
                StringBuilder sb = new StringBuilder();
                for (String entry : entries) {
                    sb.append(entry);
                    sb.append("\n");
                }
                String docText = sb.toString();
                Result doc = new Result(docText, result.getQuery(), url, result.getHitPos());
                doc.setScore(0);
                docs.add(doc);
                continue;
            }
        }
        (new WebDocumentFetcher()).start(this, result);
    }
    // wait until all fetchers are done
    waitForDocs();
    // keep old results
    Result[] newResults = docs.toArray(new Result[docs.size()]);
    Result[] allResults = new Result[results.length + newResults.length];
    for (int i = 0; i < results.length; i++) allResults[i] = results[i];
    for (int i = 0; i < newResults.length; i++) allResults[results.length + i] = newResults[i];
    return allResults;
}
Also used : Query(info.ephyra.querygeneration.Query) IOException(java.io.IOException) HttpURLConnection(java.net.HttpURLConnection) URLConnection(java.net.URLConnection) URL(java.net.URL) Result(info.ephyra.search.Result) Predicate(info.ephyra.nlp.semantics.Predicate) FileCache(info.ephyra.util.FileCache) HttpURLConnection(java.net.HttpURLConnection) HashSet(java.util.HashSet)

Example 8 with Query

Use of info.ephyra.querygeneration.Query in project lucida by claritylab.

The class FactoidsFromPredicatesFilter, method apply:

/**
	 * Extracts factoids from the predicates within the answer strings of the
	 * <code>Result</code> objects and creates a new <code>Result</code> for
	 * each extracted unique answer.
	 * 
	 * @param results array of <code>Result</code> objects containing predicates
	 * @return array of <code>Result</code> objects containing factoids
	 */
public Result[] apply(Result[] results) {
    // old results that are passed along the pipeline
    ArrayList<Result> oldResults = new ArrayList<Result>();
    // extracted factoid answers and corresponding results
    Hashtable<String, Result> factoids = new Hashtable<String, Result>();
    // extracted factoid answers and maximum weights of predicates
    Hashtable<String, Double> maxScores = new Hashtable<String, Double>();
    for (Result result : results) {
        // only apply this filter to results for the semantic parsing
        // approach
        Query query = result.getQuery();
        Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
        if (!query.extractWith(ID) || ps.length == 0 || result.getScore() != 0) {
            oldResults.add(result);
            continue;
        }
        Predicate p = result.getPredicate();
        Predicate questionP = p.getSimPredicate();
        double simScore = p.getSimScore();
        Map<String, String[]> nes = result.getNes();
        // get answer strings
        ArrayList<String> answers = new ArrayList<String>();
        if (nes != null) {
            // - allow entities in all arguments
            for (String ne : nes.keySet()) for (String arg : p.getArgs()) if (arg.contains(ne)) {
                answers.add(ne);
                break;
            }
        // - allow entities in missing arguments only
        //				for (String ne : nes.keySet())
        //					for (String missing : questionP.getMissingArgs()) {
        //						String arg = p.get(missing);
        //						if (arg != null && arg.contains(ne)) {
        //							answers.add(ne);
        //							break;
        //						}
        //					}
        } else {
            // arguments as factoid answers
            for (String missing : questionP.getMissingArgs()) {
                String arg = p.get(missing);
                if (arg != null)
                    answers.add(arg);
            }
        }
        // create result objects
        for (String answer : answers) {
            String norm = StringUtils.normalize(answer);
            Result factoid = factoids.get(norm);
            if (factoid == null) {
                // new answer
                // query, doc ID and sentence can be ambiguous
                factoid = new Result(answer, result.getQuery(), result.getDocID());
                factoid.setSentence(result.getSentence());
                factoid.addExtractionTechnique(ID);
                factoids.put(norm, factoid);
                maxScores.put(norm, simScore);
            } else if (simScore > maxScores.get(norm)) {
                // remember document ID of predicate with highest score
                factoid.setDocID(result.getDocID());
                maxScores.put(norm, simScore);
            }
            if (nes != null)
                for (String neType : nes.get(answer)) factoid.addNeType(neType);
            factoid.incScore((float) simScore);
        }
    }
    // keep old results
    Result[] newResults = factoids.values().toArray(new Result[factoids.size()]);
    Result[] allResults = new Result[oldResults.size() + newResults.length];
    oldResults.toArray(allResults);
    for (int i = 0; i < newResults.length; i++) allResults[oldResults.size() + i] = newResults[i];
    return allResults;
}
Also used : Query(info.ephyra.querygeneration.Query) Hashtable(java.util.Hashtable) ArrayList(java.util.ArrayList) Result(info.ephyra.search.Result) Predicate(info.ephyra.nlp.semantics.Predicate)

Example 9 with Query

Use of info.ephyra.querygeneration.Query in project lucida by claritylab.

The class AnswerTypeFilter, method apply:

/**
	 * Extracts NEs of particular types from the answer strings of the
	 * <code>Result</code> objects and creates a new <code>Result</code> for
	 * each extracted unique answer.
	 * 
	 * @param results array of <code>Result</code> objects
	 * @return extended array of <code>Result</code> objects
	 */
public Result[] apply(Result[] results) {
    // extracted factoid answers and corresponding results
    Hashtable<String, Result> factoids = new Hashtable<String, Result>();
    for (Result result : results) {
        // only apply this filter to results for the answer type testing
        // approach
        Query query = result.getQuery();
        String[] answerTypes = query.getAnalyzedQuestion().getAnswerTypes();
        if (!query.extractWith(ID) || answerTypes.length == 0 || result.getScore() > Float.NEGATIVE_INFINITY)
            continue;
        // split answer string into sentences and tokenize sentences
        String answer = result.getAnswer();
        String[] sentences = OpenNLP.sentDetect(answer);
        String[][] tokens = new String[sentences.length][];
        for (int i = 0; i < sentences.length; i++) tokens[i] = NETagger.tokenize(sentences[i]);
        for (String answerType : answerTypes) {
            // get IDs of the taggers for the most specific NE type that can
            // be tagged
            String[] neTypes = answerType.split("->");
            int[] neIds = new int[0];
            for (String neType : neTypes) {
                int[] thisIds = NETagger.getNeIds(neType);
                if (thisIds.length > 0)
                    neIds = thisIds;
            }
            // extract NEs of that type
            for (int neId : neIds) {
                String neType = NETagger.getNeType(neId);
                String[][] nes = NETagger.extractNes(tokens, neId);
                for (int i = 0; i < sentences.length; i++) {
                    // untokenize NEs
                    for (int j = 0; j < nes[i].length; j++) nes[i][j] = OpenNLP.untokenize(nes[i][j], sentences[i]);
                    // create new result for each unique normalized NE
                    for (String ne : nes[i]) {
                        String norm = StringUtils.normalize(ne);
                        Result factoid = factoids.get(norm);
                        if (factoid == null) {
                            // new answer
                            // query, doc ID and sentence can be ambiguous
                            factoid = new Result(ne, result.getQuery(), result.getDocID());
                            factoid.setSentence(sentences[i]);
                            factoid.addExtractionTechnique(ID);
                            factoids.put(norm, factoid);
                        }
                        factoid.addNeType(neType);
                        factoid.incScore(1);
                    // TODO consider query score, #keywords, hit pos
                    }
                }
            }
        }
    }
    // keep old results
    Result[] newResults = factoids.values().toArray(new Result[factoids.size()]);
    Result[] allResults = new Result[results.length + newResults.length];
    for (int i = 0; i < results.length; i++) allResults[i] = results[i];
    for (int i = 0; i < newResults.length; i++) allResults[results.length + i] = newResults[i];
    return allResults;
}
Also used : Query(info.ephyra.querygeneration.Query) Hashtable(java.util.Hashtable) Result(info.ephyra.search.Result)

Example 10 with Query

Use of info.ephyra.querygeneration.Query in project lucida by claritylab.

The class PredicateExtractionFilter, method apply:

/**
	 * Extracts relevant predicates from documents.
	 * 
	 * @param results array of <code>Result</code> objects containing documents
	 * @return array of <code>Result</code> objects containing predicates
	 */
public Result[] apply(Result[] results) {
    if (results.length == 0)
        return results;
    ArrayList<Result> allResults = new ArrayList<Result>();
    // extract relevant sentences
    // - get sentences that contain relevant verbs,
    //   use weights of verbs as confidence scores
    HashSet<Result> ssSet = new HashSet<Result>();
    for (Result result : results) {
        // only apply this filter to results for the semantic parsing
        // approach
        Query query = result.getQuery();
        Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
        if (!query.extractWith(FactoidsFromPredicatesFilter.ID) || ps.length == 0 || result.getScore() != 0) {
            allResults.add(result);
            continue;
        }
        // get all verb forms and build patterns
        Hashtable<String[], Double> verbFormsMap = getAllVerbForms(ps);
        ArrayList<String> verbPatterns = new ArrayList<String>();
        ArrayList<Double> verbWeights = new ArrayList<Double>();
        for (String[] verbForms : verbFormsMap.keySet()) {
            String verbPattern = "(?i).*?\\b(" + StringUtils.concat(verbForms, "|") + ")\\b.*+";
            verbPatterns.add(verbPattern);
            verbWeights.add(verbFormsMap.get(verbForms));
        }
        String[] paragraphs = result.getAnswer().split("\\n");
        for (String p : paragraphs) {
            // paragraph does not contain relevant verb?
            boolean contains = false;
            for (String verbPattern : verbPatterns) {
                if (p.matches(verbPattern)) {
                    contains = true;
                    break;
                }
            }
            if (!contains)
                continue;
            String[] sentences = LingPipe.sentDetect(p);
            for (String s : sentences) {
                // sentence does not contain relevant verb?
                Double weight = 0d;
                for (int i = 0; i < verbPatterns.size(); i++) {
                    if (s.matches(verbPatterns.get(i))) {
                        weight = verbWeights.get(i);
                        break;
                    }
                }
                if (weight == 0d)
                    continue;
                // replace whitespaces by single blanks and trim
                s = s.replaceAll("\\s++", " ").trim();
                // create sentence-level result object
                Result sentence = result.getCopy();
                sentence.setAnswer(s);
                sentence.setScore(weight.floatValue());
                ssSet.add(sentence);
            }
        }
    }
    // - check if these sentences are relevant,
    //   get MAX_SENTENCES sentences with most relevant verbs
    Result[] ss = ssSet.toArray(new Result[ssSet.size()]);
    ss = (new ScoreSorterFilter()).apply(ss);
    ArrayList<Result> ssList = new ArrayList<Result>();
    for (Result s : ss) {
        s.setScore(0);
        if (checkSentence(s))
            ssList.add(s);
        // get at most MAX_SENTENCES sentences
        if (ssList.size() >= MAX_SENTENCES)
            break;
    }
    ss = ssList.toArray(new Result[ssList.size()]);
    if (ss.length == 0)
        return allResults.toArray(new Result[allResults.size()]);
    // annotate predicates in sentences
    String[] sentences = new String[ss.length];
    for (int i = 0; i < ss.length; i++) sentences[i] = ss[i].getAnswer();
    String[][] ass = ASSERT.annotatePredicates(sentences);
    // extract predicates from annotations
    for (int i = 0; i < ass.length; i++) {
        Term[] terms = ss[i].getTerms();
        Predicate[] questionPs = ss[i].getQuery().getAnalyzedQuestion().getPredicates();
        for (int j = 0; j < ass[i].length; j++) {
            // build predicate
            Predicate predicate = null;
            try {
                predicate = new Predicate(sentences[i], ass[i][j], terms);
            } catch (ParseException e) {
                //					System.exit(1);
                continue;
            }
            // calculate similarity score
            double simScore = 0;
            Predicate simPredicate = null;
            for (Predicate questionP : questionPs) // compare to predicates with missing arguments only
            if (questionP.hasMissingArgs()) {
                double currSimScore = predicate.simScore(questionP);
                if (currSimScore > simScore) {
                    simScore = currSimScore;
                    simPredicate = questionP;
                }
            }
            // keep predicate if it is similar to a question predicate
            if (simScore > 0) {
                predicate.setSimScore(simScore);
                predicate.setSimPredicate(simPredicate);
                Result result = ss[i].getCopy();
                result.setAnswer(ass[i][j]);
                result.setSentence(sentences[i]);
                result.setPredicate(predicate);
                allResults.add(result);
            }
        }
    }
    return allResults.toArray(new Result[allResults.size()]);
}
Also used : Query(info.ephyra.querygeneration.Query) ArrayList(java.util.ArrayList) Term(info.ephyra.questionanalysis.Term) Result(info.ephyra.search.Result) Predicate(info.ephyra.nlp.semantics.Predicate) ParseException(java.text.ParseException) HashSet(java.util.HashSet)

Aggregations

Query (info.ephyra.querygeneration.Query): 19
Result (info.ephyra.search.Result): 11
ArrayList (java.util.ArrayList): 8
Predicate (info.ephyra.nlp.semantics.Predicate): 6
QuestionInterpretation (info.ephyra.questionanalysis.QuestionInterpretation): 4
Term (info.ephyra.questionanalysis.Term): 4
IOException (java.io.IOException): 4
BagOfWordsG (info.ephyra.querygeneration.generators.BagOfWordsG): 3
AnalyzedQuestion (info.ephyra.questionanalysis.AnalyzedQuestion): 3
Hashtable (java.util.Hashtable): 3
TRECTarget (info.ephyra.trec.TRECTarget): 2
BufferedReader (java.io.BufferedReader): 2
FileReader (java.io.FileReader): 2
HashSet (java.util.HashSet): 2
QuestionReformulator (info.ephyra.querygeneration.QuestionReformulator): 1
QuestionInterpretationG (info.ephyra.querygeneration.generators.QuestionInterpretationG): 1
FileCache (info.ephyra.util.FileCache): 1
File (java.io.File): 1
FileOutputStream (java.io.FileOutputStream): 1
PrintWriter (java.io.PrintWriter): 1