Search in sources :

Example 1 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.

the class TaxonMatcher method extractorImpl.

/**
     * Implementation details -- use with or without the formal ID/buffer
     * pairing.
     *
     * @param id
     *            doc id
     * @param buf
     *            input text
     * @return list of matches
     * @throws ExtractionException
     */
private List<TextMatch> extractorImpl(String id, String buf) throws ExtractionException {
    // Fall back to the placeholder doc ID when the caller did not supply one.
    final String docid = (id == null) ? NO_DOC_ID : id;
    final String matchIdPrefix = docid + "#";

    List<TextMatch> matches = new ArrayList<TextMatch>();

    // The tagger call fills beanMap (solr record id -> reference object) as a side effect.
    Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
    QueryResponse response = tagTextCallSolrTagger(buf, docid, beanMap);
    @SuppressWarnings("unchecked") List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
    log.debug("TAGS SIZE = {}", tags.size());

    // Counts every tag seen, including ones filtered out below, so match IDs
    // reflect the tag's ordinal position in the tagger response.
    int tagOrdinal = 0;
    for (NamedList<?> tag : tags) {
        TaxonMatch match = new TaxonMatch();
        match.start = ((Integer) tag.get("startOffset")).intValue();
        // endOffset is one char past the last matched char.
        match.end = ((Integer) tag.get("endOffset")).intValue();
        tagOrdinal++;
        match.match_id = matchIdPrefix + tagOrdinal;

        // Take the text from the input buffer rather than the tag's "matchText",
        // which is not reliable and can be null.
        match.setText(buf.substring(match.start, match.end));

        // A phrase containing a single TAB is okay; more formatting space than that is rejected.
        if (TextUtils.countFormattingSpace(match.getText()) > 1) {
            continue;
        }

        @SuppressWarnings("unchecked") List<Integer> taxonIDs = (List<Integer>) tag.get("ids");
        for (Integer solrId : taxonIDs) {
            Taxon taxon = (Taxon) beanMap.get(solrId);
            if (taxon == null) {
                // No reference data was loaded for this record id.
                continue;
            }
            // Filter out non-acronyms: e.g., 'who' is not a match for 'WHO'.
            boolean rejectedAsNonAcronym = this.filterNonAcronyms && taxon.isAcronym && !match.isUpper();
            if (!rejectedAsNonAcronym) {
                match.addTaxon(taxon);
            }
        }

        // Only matches that kept at least one taxon are reported.
        if (match.hasTaxons()) {
            matches.add(match);
        }
    }
    log.debug("FOUND LABELS count={}", matches.size());
    return matches;
}
Also used : HashMap(java.util.HashMap) NamedList(org.apache.solr.common.util.NamedList) Taxon(org.opensextant.data.Taxon) ArrayList(java.util.ArrayList) TextMatch(org.opensextant.extraction.TextMatch) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) SolrDocumentList(org.apache.solr.common.SolrDocumentList) ArrayList(java.util.ArrayList) NamedList(org.apache.solr.common.util.NamedList) List(java.util.List)

Example 2 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.

the class SolrProxy method searchGazetteer.

/**
     * Search an OpenSextant solr gazetteer.
     *
     * @param index solr server handle
     * @param qparams search parameters
     * @return list of places
     * @throws SolrServerException on err
     */
public static List<Place> searchGazetteer(SolrServer index, SolrParams qparams) throws SolrServerException {
    // Run the query over HTTP GET and pull the matching documents out of the response.
    SolrDocumentList hits = index.query(qparams, SolrRequest.METHOD.GET).getResults();

    // Convert each Solr document into a Place, preserving result order.
    List<Place> found = new ArrayList<>();
    for (SolrDocument hit : hits) {
        Place converted = SolrProxy.createPlace(hit);
        found.add(converted);
    }
    return found;
}
Also used : SolrDocument(org.apache.solr.common.SolrDocument) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) ArrayList(java.util.ArrayList) SolrDocumentList(org.apache.solr.common.SolrDocumentList) Place(org.opensextant.data.Place)

Example 3 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.

the class GazetteerMatcher method searchAdvanced.

/**
     * This is a variation on SolrGazetteer.search(), just this creates ScoredPlace which is
     * immediately usable with scoring and ranking matches. The score for a ScoredPlace is
     * created when added to PlaceCandidate: a default score is created for the place.
     * 
     * <pre>
     *    Usage:
     *    pc = PlaceCandidate();
     *    list = gaz.searchAdvanced("name:Boston", true)  // solr fielded query used as-is.
     *    for ScoredPlace p: list:
     *        pc.addPlace( p )
     * </pre>
     * 
     * @param place
     *            the place string or text; or a Solr query
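     * @param as_solr
     *            true to send {@code place} to Solr as-is (e.g., a fielded query);
     *            false to wrap it in double quotes as a bare phrase query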
     * @param maxLen
     *            max length of gazetteer place names.
     * @return places List of scoreable place entries
     * @throws SolrServerException
     *             the solr server exception
     */
public List<ScoredPlace> searchAdvanced(String place, boolean as_solr, int maxLen) throws SolrServerException {
    // A Solr query is passed through untouched; a bare keyword query needs
    // to be quoted as "word word word".
    final String query = as_solr ? place : "\"" + place + "\"";
    params.set("q", query);

    QueryResponse response = solr.getInternalSolrServer().query(params, SolrRequest.METHOD.GET);

    // Length filter applies only for a positive maxLen.
    // Alternative: store name as string in solr, vice full text field.
    final boolean limitNameLength = maxLen > 0;

    List<ScoredPlace> results = new ArrayList<>();
    for (SolrDocument doc : response.getResults()) {
        if (limitNameLength && SolrProxy.getString(doc, "name").length() > maxLen) {
            // Name is longer than the caller allows; drop this entry.
            continue;
        }
        results.add(createPlace(doc));
    }
    return results;
}
Also used : SolrDocument(org.apache.solr.common.SolrDocument) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) ArrayList(java.util.ArrayList)

Example 4 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.

the class GazetteerMatcher method tagText.

/**
     * Geotag a document, returning PlaceCandidates for the mentions in
     * document. Optionally just return the PlaceCandidates with name only and
     * no Place objects attached. Names of continents are passed back as matches,
     * with geo matches. Continents are filtered out by default.
     *
     * @param buffer
     *            text
     * @param docid
     *            identity of the text
     * @param tagOnly
     *            True if you wish to get the matched phrases only. False if you
     *            want the full list of Place Candidates.
     * @param fld
     *            gazetteer field to use for tagging
     * @param langid
     *             ISO lang ID 
     * @return place_candidates List of place candidates
     * @throws ExtractionException
     *             on err
     */
public List<PlaceCandidate> tagText(String buffer, String docid, boolean tagOnly, String fld, String langid) throws ExtractionException {
    // Shape of the tagger response this method consumes:
    // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40,
    // "startOffset":38},
    // { "ids":[750308, 2769912, 2770041, 10413973, 10417546],
    // "endOffset":49,
    // "startOffset":41},
    // ...
    // "matchingDocs":{"numFound":75, "start":0, "docs":[ {
    // "place_id":"USGS1992921", "name":"Monterrey", "cc":"PR"}, {
    // "place_id":"USGS1991763", "name":"Monterrey", "cc":"PR"}, ]

    // Reset per-call counts.
    this.defaultFilterCount = 0;
    this.userFilterCount = 0;
    // during post-processing tags we may have to distinguish between tagging/tokenizing 
    // general vs. cjk vs. ar. But not yet though.
    // boolean useGeneralMode = DEFAULT_TAG_FIELD.equals(fld);
    long t0 = System.currentTimeMillis();
    log.debug("TEXT SIZE = {}", buffer.length());
    // Document-level case metrics feed the stop-word filter and acronym heuristics below.
    int[] textMetrics = TextUtils.measureCase(buffer);
    boolean isUpperCase = TextUtils.isUpperCaseDocument(textMetrics);
    boolean isLowerCase = TextUtils.isLowerCaseDocument(textMetrics);
    params.set("field", fld);
    // The tagger call fills beanMap (solr record id -> reference object) as a side effect.
    Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
    QueryResponse response = tagTextCallSolrTagger(buffer, docid, beanMap);
    @SuppressWarnings("unchecked") List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
    // Timing: t1 is t0 plus the query time reported by Solr; t2 is wall-clock
    // time once the tagger call has returned. getNamesTime is the remainder (t2 - t1).
    this.tagNamesTime = response.getQTime();
    long t1 = t0 + tagNamesTime;
    long t2 = System.currentTimeMillis();
    boolean geocode = !tagOnly;
    /*
         * Retrieve all offsets into a long list. These offsets will report a
         * text span and all the gazetteer record IDs that are associated to
         * that span. The text could either be a name, a code or some other
         * abbreviation.
         *
         * For practical reasons the default behavior is to filter trivial spans
         * given the gazetteer data that is returned for them.
         *
         * WARNING: lots of optimizations occur here due to the potentially
         * large volume of tags and gazetteer data that is involved. And this is
         * relatively early in the pipeline.
         */
    log.debug("DOC={} TAGS SIZE={}", docid, tags.size());
    // Keyed by start offset, so the returned candidates are in document order.
    TreeMap<Integer, PlaceCandidate> candidates = new TreeMap<Integer, PlaceCandidate>();
    // names matched is used only for debugging, currently.
    Set<String> namesMatched = new HashSet<>();
    tagLoop: for (NamedList<?> tag : tags) {
        int x1 = (Integer) tag.get("startOffset");
        // +1 char after last matched
        int x2 = (Integer) tag.get("endOffset");
        int len = x2 - x1;
        if (len < 2) {
            // Ignoring place names whose length is less than 2 chars
            ++this.defaultFilterCount;
            continue;
        }
        // Prefer the tagger's "matchText" when present; it may be absent from the
        // tag, in which case the same span is taken from the input buffer so the
        // filters below never see a null.
        String matchText = (String) tag.get("matchText");
        if (matchText == null) {
            matchText = buffer.substring(x1, x2);
        }
        // Get chars immediately around the match, for light NLP rules.
        char postChar = 0;
        char preChar = 0;
        if (x2 < buffer.length()) {
            postChar = buffer.charAt(x2);
        }
        if (x1 > 0) {
            preChar = buffer.charAt(x1 - 1);
            if (assessApostrophe(preChar, matchText)) {
                ++this.defaultFilterCount;
                continue;
            }
        }
        // Very short ASCII matches (len < 3) must be all upper case to be allowed.
        // If lowercase abbreviations are allowed, then all matches are passed.
        if (len < 3) {
            if (!allowLowercaseAbbrev) {
                if (TextUtils.isASCII(matchText) && !StringUtils.isAllUpperCase(matchText)) {
                    ++this.defaultFilterCount;
                    continue;
                }
            }
        }
        if (TextUtils.countFormattingSpace(matchText) > 1) {
            // Phrases with words broken across more than one line are not
            // valid matches.
            // Phrase with a single TAB is okay
            ++this.defaultFilterCount;
            continue;
        }
        // Eliminate any newlines and extra whitespace in match
        matchText = TextUtils.squeeze_whitespace(matchText);
        /*
             * Filter out trivial tags. Due to normalization, we tend to get
             * lots of false positives that can be eliminated early.  This is 
             * testing matches against the most general set of stop words.
             */
        if (filter.filterOut(matchText)) {
            ++this.defaultFilterCount;
            continue;
        }
        PlaceCandidate pc = new PlaceCandidate();
        pc.start = x1;
        pc.end = x2;
        pc.setText(matchText);
        /*
             * Filter out tags that user determined ahead of time as not-places
             * for their context.
             *
             */
        if (userfilter != null) {
            if (userfilter.filterOut(pc.getTextnorm())) {
                log.debug("User Filter:{}", matchText);
                ++this.userFilterCount;
                continue;
            }
        }
        /*
             * Continent filter is needed, as many mentions of continents confuse
             * real geotagging/geocoding. Continents are still returned to the
             * caller, but flagged as filtered out.
             */
        if (continents.filterOut(pc.getTextnorm())) {
            pc.isContinent = true;
            pc.setFilteredOut(true);
            candidates.put(pc.start, pc);
            continue;
        }
        /*
             * Further testing is done if lang ID is provided AND if we have a stop list
             * for that language.  Otherwise, short terms are filtered out if they appear in any lang stop list.
             * NOTE: internally TagFilter here checks only languages other than English, Spanish and Vietnamese.
             */
        if (filter.filterOut(pc, langid, isUpperCase, isLowerCase)) {
            ++this.defaultFilterCount;
            log.debug("STOPWORD {} {}", langid, pc.getText());
            continue;
        }
        /*
             * Found UPPER CASE text in a mixed-cased document.
             * Conservatively, this is likely an acronym or some heading.
             * But possibly still a valid place name.
             * HEURISTIC: acronyms are relatively short. 
             * HEURISTIC: region codes can be acronyms and are valid places
             * 
             * using such place candidates you may score short acronym matches lower than fully named ones.
             * when inferring boundaries (states, provinces, etc)
             */
        if (!isUpperCase && pc.isUpper() && len < 5) {
            pc.isAcronym = true;
        }
        pc.hasDiacritics = TextUtils.hasDiacritics(pc.getText());
        pc.setSurroundingTokens(buffer);
        @SuppressWarnings("unchecked") List<Integer> placeRecordIds = (List<Integer>) tag.get("ids");
        /*
             * This assertion is helpful in debugging: assert
             * placeRecordIds.size() == new
             * HashSet<Integer>(placeRecordIds).size() : "ids should be unique";
             */
        // assert!placeRecordIds.isEmpty();
        namesMatched.clear();
        //double maxNameBias = 0.0;
        for (Integer solrId : placeRecordIds) {
            // Yes, we must cast here.
            // As long as createTag generates the correct type stored in
            // beanMap we are fine.
            ScoredPlace pGeo = (ScoredPlace) beanMap.get(solrId);
            log.debug("{} = {}", pc.getText(), pGeo);
            if (pGeo == null) {
                // No gazetteer record was loaded for this id; skip it rather
                // than fail on a null dereference below.
                continue;
            }
            //
            if (!allowLowercaseAbbrev && pGeo.isAbbreviation() && pc.isLower()) {
                log.debug("Ignore lower case term={}", pc.getText());
                // Drops the whole candidate, not just this record.
                // loop and not tagLoop?
                continue tagLoop;
            }
            /*
                 * If text match contains "." and it matches any abbreviation,
                 * mark the candidate as an abbrev. TODO: Possibly best confirm
                 * this by sentence detection, as well. However, this pertains
                 * to text spans that contain "." within the bounds, and not
                 * likely an ending. E.g., "U.S." or "U.S" are trivial examples;
                 * "US" is more ambiguous, as we need to know if document is
                 * upperCase.
                 * 
                 * Any place abbreviation will trigger isAbbreviation = true
                 * 
                 * "IF YOU FIND US HERE"  the term 'US' is ambiguous here, so 
                 * it is not classified as an abbreviation. Otherwise if you have
                 * "My organization YAK happens to coincide with a place named Yak.
                 * But we first must determine if 'YAK' is a valid abbreviation for an actual place.
                 * HEURISTIC: place abbreviations are relatively short, e.g. one word(len=7 or less)
                 */
            if (len < 8 && !pc.isAbbreviation) {
                assessAbbreviation(pc, pGeo, postChar, isUpperCase);
            }
            if (log.isDebugEnabled()) {
                namesMatched.add(pGeo.getName());
            }
            /*
                 * Country names are the only names you can reasonably set ahead
                 * of time. All other names need to be assessed in context.
                 * Negate country names, e.g., "Georgia", by exception.
                 */
            if (pGeo.isCountry()) {
                pc.isCountry = true;
            }
            if (geocode) {
                pGeo.defaultHierarchicalPath();
                // Default score for geo will be calculated in PlaceCandidate
                pc.addPlace(pGeo);
            }
        }
        // When geocoding, a candidate left with no places attached is dropped.
        if (geocode && !pc.hasPlaces()) {
            log.debug("Place has no places={}", pc.getText());
            continue;
        } else {
            if (log.isDebugEnabled()) {
                log.debug("Text {} matched {}", pc.getText(), namesMatched);
            }
        }
        candidates.put(pc.start, pc);
    }
    // for tag
    long t3 = System.currentTimeMillis();
    // this.tagNamesTime = (int)(t1 - t0);
    this.getNamesTime = (int) (t2 - t1);
    this.totalTime = (int) (t3 - t0);
    if (log.isDebugEnabled()) {
        summarizeExtraction(candidates.values(), docid);
    }
    // Running totals across calls.
    this.filteredTotal += this.defaultFilterCount + this.userFilterCount;
    this.matchedTotal += candidates.size();
    return new ArrayList<PlaceCandidate>(candidates.values());
}
Also used : HashMap(java.util.HashMap) NamedList(org.apache.solr.common.util.NamedList) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) ArrayList(java.util.ArrayList) NamedList(org.apache.solr.common.util.NamedList) List(java.util.List) HashSet(java.util.HashSet)

Example 5 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project ORCID-Source by ORCID.

the class AppContextSolrTest method testServerRunning.

@Test
@Ignore
public void testServerRunning() throws Exception {
    // Smoke test: any non-null response to a simple keyword query means the server answered.
    SolrQuery keywordQuery = new SolrQuery();
    keywordQuery.setQuery("carberry");
    assertNotNull(solrServer.query(keywordQuery));
}
Also used : QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) SolrQuery(org.apache.solr.client.solrj.SolrQuery) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

QueryResponse (org.apache.solr.client.solrj.response.QueryResponse)456 SolrQuery (org.apache.solr.client.solrj.SolrQuery)249 Test (org.junit.Test)156 SolrDocument (org.apache.solr.common.SolrDocument)138 SolrDocumentList (org.apache.solr.common.SolrDocumentList)131 SolrServerException (org.apache.solr.client.solrj.SolrServerException)110 IOException (java.io.IOException)93 SolrInputDocument (org.apache.solr.common.SolrInputDocument)83 ArrayList (java.util.ArrayList)70 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)65 HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient)63 SolrClient (org.apache.solr.client.solrj.SolrClient)47 NamedList (org.apache.solr.common.util.NamedList)43 HashMap (java.util.HashMap)36 Map (java.util.Map)36 List (java.util.List)32 SolrParams (org.apache.solr.common.params.SolrParams)32 CloudSolrClient (org.apache.solr.client.solrj.impl.CloudSolrClient)28 UpdateRequest (org.apache.solr.client.solrj.request.UpdateRequest)28 SolrQueryResponse (org.apache.solr.response.SolrQueryResponse)26