Search in sources :

Example 16 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project ORCID-Source by ORCID.

the class FundingSubtypeSolrDaoImpl method getFundingTypes.

/**
 * Searches Solr for funding-type documents matching the given search term as a prefix.
 *
 * <p>The term is suffixed with a wildcard and run as an edismax query over the
 * {@code org-defined-funding-type} (boosted) and {@code text} fields.
 *
 * @param searchTerm  the prefix to search for
 * @param firstResult zero-based offset of the first result to return; values that are
 *                    not positive leave the Solr default offset in place
 * @param maxResult   maximum number of results to return; values that are not positive
 *                    leave the Solr default row count in place
 * @return the matching documents bound to {@link OrgDefinedFundingTypeSolrDocument} beans
 * @throws NonTransientDataAccessResourceException if the Solr query fails
 */
@Override
public List<OrgDefinedFundingTypeSolrDocument> getFundingTypes(String searchTerm, int firstResult, int maxResult) {
    SolrQuery query = new SolrQuery();
    // NOTE(review): searchTerm is concatenated into the query unescaped; if it can carry
    // user-supplied Solr syntax, consider escaping it before it reaches this method.
    query.setQuery("{!edismax qf='org-defined-funding-type^50.0 text^1.0' pf='org-defined-funding-type^50.0' mm=1 sort='score desc'}" + searchTerm + "*").setFields("*");
    // Honour the paging arguments, which were previously accepted but never applied.
    // Non-positive values are skipped so such callers keep getting the Solr defaults.
    if (firstResult > 0) {
        query.setStart(firstResult);
    }
    if (maxResult > 0) {
        query.setRows(maxResult);
    }
    try {
        QueryResponse queryResponse = solrServerReadOnly.query(query);
        return queryResponse.getBeans(OrgDefinedFundingTypeSolrDocument.class);
    } catch (SolrServerException se) {
        String errorMessage = MessageFormat.format("Error when attempting to search for funding types, with search term {0}", searchTerm);
        throw new NonTransientDataAccessResourceException(errorMessage, se);
    }
}
Also used : NonTransientDataAccessResourceException(org.springframework.dao.NonTransientDataAccessResourceException) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) SolrServerException(org.apache.solr.client.solrj.SolrServerException) SolrQuery(org.apache.solr.client.solrj.SolrQuery)

Example 17 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project ORCID-Source by ORCID.

the class SolrDaoImpl method findByOrcidAsReader.

/**
 * Looks up a record by ORCID iD and exposes the {@code "stream"} entry of the Solr
 * response as a UTF-8 character reader.
 *
 * <p>The query requests the score, ORCID and public-profile fields and sets the
 * {@code wt} parameter to {@code orcidProfile}.
 *
 * @param orcid the ORCID iD to match exactly (it is wrapped in double quotes)
 * @return a reader over the response's {@code "stream"} entry
 * @throws NonTransientDataAccessResourceException if Solr reports an error
 */
@Override
public Reader findByOrcidAsReader(String orcid) {
    SolrQuery streamQuery = new SolrQuery();
    String orcidClause = ORCID + ":\"" + orcid + "\"";
    streamQuery.setQuery(orcidClause);
    streamQuery.setFields(SCORE, ORCID, PUBLIC_PROFILE);
    streamQuery.add("wt", "orcidProfile");
    try {
        Object rawStream = solrServerForStreaming.query(streamQuery).getResponse().get("stream");
        InputStream profileStream = (InputStream) rawStream;
        return new InputStreamReader(profileStream, "UTF-8");
    } catch (SolrServerException | SolrException e) {
        throw new NonTransientDataAccessResourceException(
                MessageFormat.format("Error when attempting to retrieve stream for orcid {0}", orcid), e);
    } catch (UnsupportedEncodingException e) {
        // UTF-8 is a mandatory charset, so this is not expected to happen in practice.
        throw new RuntimeException(e);
    }
}
Also used : NonTransientDataAccessResourceException(org.springframework.dao.NonTransientDataAccessResourceException) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) SolrServerException(org.apache.solr.client.solrj.SolrServerException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) SolrQuery(org.apache.solr.client.solrj.SolrQuery) SolrException(org.apache.solr.common.SolrException)

Example 18 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project ORCID-Source by ORCID.

the class SolrDaoImpl method retrieveLastModified.

/**
 * Reads the profile last-modified date stored in Solr for the given ORCID iD.
 *
 * @param orcid the ORCID iD to match exactly (it is wrapped in double quotes)
 * @return the last-modified value of the first matching document, or {@code null}
 *         when the query matches nothing
 * @throws NonTransientDataAccessResourceException if the Solr query fails
 */
@Override
public Date retrieveLastModified(String orcid) {
    SolrQuery lastModifiedQuery = new SolrQuery();
    lastModifiedQuery.setQuery(ORCID + ":\"" + orcid + "\"");
    // Only the last-modified field is needed from each hit.
    lastModifiedQuery.setFields(PROFILE_LAST_MODIFIED_DATE);
    try {
        List<SolrDocument> matches = solrServer.query(lastModifiedQuery).getResults();
        return matches.isEmpty() ? null : (Date) matches.get(0).getFieldValue(PROFILE_LAST_MODIFIED_DATE);
    } catch (SolrServerException e) {
        throw new NonTransientDataAccessResourceException("Error retrieving last modified date from SOLR Server", e);
    }
}
Also used : NonTransientDataAccessResourceException(org.springframework.dao.NonTransientDataAccessResourceException) SolrDocument(org.apache.solr.common.SolrDocument) OrcidSolrDocument(org.orcid.utils.solr.entities.OrcidSolrDocument) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) SolrServerException(org.apache.solr.client.solrj.SolrServerException) SolrQuery(org.apache.solr.client.solrj.SolrQuery) Date(java.util.Date)

Example 19 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.

the class GazetteerMatcher method searchAdvanced.

/**
 * Gazetteer search that returns {@code ScoredPlace} entries, a variation on
 * SolrGazetteer.search(). The results can be added directly to a PlaceCandidate
 * for scoring and ranking.
 *
 * <pre>
 *    Usage:
 *    pc = PlaceCandidate();
 *    list = gaz.searchAdvanced("name:Boston", true, 0)  // solr fielded query used as-is.
 *    for ScoredPlace p: list:
 *        pc.addPlace( p )
 * </pre>
 *
 * @param place
 *            the place string or text; or a Solr query
 * @param as_solr
 *            true to send {@code place} to Solr unchanged as the query; false to
 *            wrap it in double quotes first
 * @param maxLen
 *            max length of gazetteer place names; results with a longer name are
 *            skipped. Values of zero or less disable the length filter.
 * @return places List of scoreable place entries
 * @throws SolrServerException
 *             the solr server exception
 */
public List<ScoredPlace> searchAdvanced(String place, boolean as_solr, int maxLen) throws SolrServerException {
    // A bare keyword query needs to be quoted as "word word word"; a Solr query is used as-is.
    String queryText = as_solr ? place : "\"" + place + "\"";
    params.set("q", queryText);

    QueryResponse response = solr.getInternalSolrServer().query(params, SolrRequest.METHOD.GET);

    final boolean applyLengthFilter = maxLen > 0;
    List<ScoredPlace> found = new ArrayList<>();
    for (SolrDocument doc : response.getResults()) {
        // Length filter. Alternative: store name as string in solr, vice full text field.
        if (applyLengthFilter && SolrProxy.getString(doc, "name").length() > maxLen) {
            continue;
        }
        found.add(createPlace(doc));
    }
    return found;
}
Also used : SolrDocument(org.apache.solr.common.SolrDocument) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) ArrayList(java.util.ArrayList)

Example 20 with QueryResponse

use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.

the class GazetteerMatcher method tagText.

/**
     * Geotag a document, returning PlaceCandidates for the mentions in
     * document. Optionally just return the PlaceCandidates with name only and
     * no Place objects attached. Names of continents are passed back as matches
     * alongside geo matches, but they are flagged as continents and marked as
     * filtered out.
     *
     * Side effects visible in this method: the per-call filter counters are reset,
     * the "field" request parameter is set to {@code fld}, the timing fields
     * (tagNamesTime, getNamesTime, totalTime) are updated, and the running totals
     * (filteredTotal, matchedTotal) are incremented.
     *
     * @param buffer
     *            text
     * @param docid
     *            identity of the text
     * @param tagOnly
     *            True if you wish to get the matched phrases only. False if you
     *            want the full list of Place Candidates.
     * @param fld
     *            gazetteer field to use for tagging
     * @param langid
     *             ISO lang ID 
     * @return place_candidates List of place candidates, ordered by start offset;
     *         at most one candidate is kept per start offset (a later candidate
     *         with the same start replaces the earlier one)
     * @throws ExtractionException
     *             on err (presumably raised by the tagger call; confirm against
     *             tagTextCallSolrTagger)
     */
public List<PlaceCandidate> tagText(String buffer, String docid, boolean tagOnly, String fld, String langid) throws ExtractionException {
    // Sample of the tagger response that is parsed below:
    // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40,
    // "startOffset":38},
    // { "ids":[750308, 2769912, 2770041, 10413973, 10417546],
    // "endOffset":49,
    // "startOffset":41},
    // ...
    // "matchingDocs":{"numFound":75, "start":0, "docs":[ {
    // "place_id":"USGS1992921", "name":"Monterrey", "cc":"PR"}, {
    // "place_id":"USGS1991763", "name":"Monterrey", "cc":"PR"}, ]
    // Reset counts.
    this.defaultFilterCount = 0;
    this.userFilterCount = 0;
    // during post-processing tags we may have to distinguish between tagging/tokenizing 
    // general vs. cjk vs. ar. But not yet though.
    // boolean useGeneralMode = DEFAULT_TAG_FIELD.equals(fld);
    long t0 = System.currentTimeMillis();
    log.debug("TEXT SIZE = {}", buffer.length());
    // Document-level case metrics; used later by the stop filter and the acronym/abbreviation rules.
    int[] textMetrics = TextUtils.measureCase(buffer);
    boolean isUpperCase = TextUtils.isUpperCaseDocument(textMetrics);
    boolean isLowerCase = TextUtils.isLowerCaseDocument(textMetrics);
    params.set("field", fld);
    // beanMap is handed to the tagger call and afterwards read by Solr record ID;
    // presumably the call populates it with ScoredPlace objects -- see the cast below.
    Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
    QueryResponse response = tagTextCallSolrTagger(buffer, docid, beanMap);
    @SuppressWarnings("unchecked") List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
    this.tagNamesTime = response.getQTime();
    // t1 is an estimate: start time plus the query time reported by Solr.
    // t2 is taken after the response is in hand, so (t2 - t1) is the remainder of the call.
    long t1 = t0 + tagNamesTime;
    long t2 = System.currentTimeMillis();
    boolean geocode = !tagOnly;
    /*
         * Retrieve all offsets into a long list. These offsets will report a
         * text span and all the gazetteer record IDs that are associated to
         * that span. The text could either be a name, a code or some other
         * abbreviation.
         *
         * For practical reasons the default behavior is to filter trivial spans
         * given the gazetteer data that is returned for them.
         *
         * WARNING: lots of optimizations occur here due to the potentially
         * large volume of tags and gazetteer data that is involved. And this is
         * relatively early in the pipeline.
         */
    log.debug("DOC={} TAGS SIZE={}", docid, tags.size());
    // Keyed by start offset, so the returned candidates are ordered by position in the text.
    TreeMap<Integer, PlaceCandidate> candidates = new TreeMap<Integer, PlaceCandidate>();
    // names matched is used only for debugging, currently.
    Set<String> namesMatched = new HashSet<>();
    tagLoop: for (NamedList<?> tag : tags) {
        int x1 = (Integer) tag.get("startOffset");
        int x2 = (Integer) tag.get("endOffset");
        int len = x2 - x1;
        if (len == 1) {
            // Ignore single-character matches.
            ++this.defaultFilterCount;
            continue;
        }
        // The match text is read from the tag's "matchText" entry.
        // NOTE(review): an earlier comment here argued for NOT using the tagger's
        // "matchText" option and deriving the text from buffer instead; the code
        // below does rely on "matchText" being present in the response -- confirm
        // the tagger request enables it.
        String matchText = (String) tag.get("matchText");
        // Get the chars immediately following and preceding the match, for light NLP rules.
        // They stay 0 when the match touches the end or start of the buffer.
        char postChar = 0;
        char preChar = 0;
        if (x2 < buffer.length()) {
            postChar = buffer.charAt(x2);
        }
        if (x1 > 0) {
            preChar = buffer.charAt(x1 - 1);
            if (assessApostrophe(preChar, matchText)) {
                ++this.defaultFilterCount;
                continue;
            }
        }
        // Short matches (fewer than 3 chars): unless lowercase abbreviations are allowed,
        // ASCII matches that are not entirely upper case are filtered out.
        // If lowercase abbreviations are allowed, then all matches are passed.
        if (len < 3) {
            if (!allowLowercaseAbbrev) {
                if (TextUtils.isASCII(matchText) && !StringUtils.isAllUpperCase(matchText)) {
                    ++this.defaultFilterCount;
                    continue;
                }
            }
        }
        if (TextUtils.countFormattingSpace(matchText) > 1) {
            // Phrases with words broken across more than one line are not
            // valid matches.
            // Phrase with a single TAB is okay
            ++this.defaultFilterCount;
            continue;
        }
        // Eliminate any newlines and extra whitespace in match
        matchText = TextUtils.squeeze_whitespace(matchText);
        /**
             * Filter out trivial tags. Due to normalization, we tend to get
             * lots of false positives that can be eliminated early.  This is 
             * testing matches against the most general set of stop words.
             */
        if (filter.filterOut(matchText)) {
            ++this.defaultFilterCount;
            continue;
        }
        PlaceCandidate pc = new PlaceCandidate();
        pc.start = x1;
        pc.end = x2;
        pc.setText(matchText);
        /*
             * Filter out tags that user determined ahead of time as not-places
             * for their context.
             *
             */
        if (userfilter != null) {
            if (userfilter.filterOut(pc.getTextnorm())) {
                log.debug("User Filter:{}", matchText);
                ++this.userFilterCount;
                continue;
            }
        }
        /*
             * Continent filter is needed, as many mentions of continents confuse
             * real geotagging/geocoding. Continent mentions are still returned,
             * but flagged as continents and marked filtered-out; they are not
             * counted in defaultFilterCount.
             * 
             */
        if (continents.filterOut(pc.getTextnorm())) {
            pc.isContinent = true;
            pc.setFilteredOut(true);
            candidates.put(pc.start, pc);
            continue;
        }
        /**
             * Further testing is done if lang ID is provided AND if we have a stop list
             * for that language.  Otherwise, short terms are filtered out if they appear in any lang stop list.
             * NOTE: internally TagFilter here checks only languages other than English, Spanish and Vietnamese.
             */
        if (filter.filterOut(pc, langid, isUpperCase, isLowerCase)) {
            ++this.defaultFilterCount;
            log.debug("STOPWORD {} {}", langid, pc.getText());
            continue;
        }
        /*
             * Found UPPER CASE text in a mixed-cased document.
             * Conservatively, this is likely an acronym or some heading.
             * But possibly still a valid place name.
             * HEURISTIC: acronyms are relatively short (fewer than 5 chars here). 
             * HEURISTIC: region codes can be acronyms and are valid places
             * 
             * using such place candidates you may score short acronym matches lower than fully named ones.
             * when inferring boundaries (states, provinces, etc)
             */
        if (!isUpperCase && pc.isUpper() && len < 5) {
            pc.isAcronym = true;
        }
        pc.hasDiacritics = TextUtils.hasDiacritics(pc.getText());
        pc.setSurroundingTokens(buffer);
        // Gazetteer record IDs associated with this text span.
        @SuppressWarnings("unchecked") List<Integer> placeRecordIds = (List<Integer>) tag.get("ids");
        /*
             * This assertion is helpful in debugging: assert
             * placeRecordIds.size() == new
             * HashSet<Integer>(placeRecordIds).size() : "ids should be unique";
             */
        // assert!placeRecordIds.isEmpty();
        namesMatched.clear();
        //double maxNameBias = 0.0;
        for (Integer solrId : placeRecordIds) {
            log.debug("{} = {}", pc.getText(), beanMap.get(solrId));
            // Yes, we must cast here.
            // As long as createTag generates the correct type stored in
            // beanMap we are fine.
            // NOTE(review): there is no null check; an ID missing from beanMap
            // would fail on the first use of pGeo below.
            ScoredPlace pGeo = (ScoredPlace) beanMap.get(solrId);
            // A lowercase mention matching an abbreviation record is rejected unless
            // lowercase abbreviations are allowed.
            if (!allowLowercaseAbbrev && pGeo.isAbbreviation() && pc.isLower()) {
                log.debug("Ignore lower case term={}", pc.getText());
                // NOTE(review): this abandons the whole candidate (outer tagLoop), not
                // just this gazetteer record, and does not bump a filter counter.
                // The original author also questioned: loop and not tagLoop?
                continue tagLoop;
            }
            /*
                 * If text match contains "." and it matches any abbreviation,
                 * mark the candidate as an abbrev. TODO: Possibly best confirm
                 * this by sentence detection, as well. However, this pertains
                 * to text spans that contain "." within the bounds, and not
                 * likely an ending. E.g., "U.S." or "U.S" are trivial examples;
                 * "US" is more ambiguous, as we need to know if document is
                 * upperCase.
                 * 
                 * Any place abbreviation will trigger isAbbreviation = true
                 * 
                 * "IF YOU FIND US HERE"  the term 'US' is ambiguous here, so 
                 * it is not classified as an abbreviation. Otherwise if you have
                 * "My organization YAK happens to coincide with a place named Yak.
                 * But we first must determine if 'YAK' is a valid abbreviation for an actual place.
                 * HEURISTIC: place abbreviations are relatively short, e.g. one word(len=7 or less)
                 */
            if (len < 8 && !pc.isAbbreviation) {
                assessAbbreviation(pc, pGeo, postChar, isUpperCase);
            }
            // Names are collected only when debug logging is on; they feed the debug message below.
            if (log.isDebugEnabled()) {
                namesMatched.add(pGeo.getName());
            }
            /**
                 * Country names are the only names you can reasonably set ahead
                 * of time. All other names need to be assessed in context.
                 * Negate country names, e.g., "Georgia", by exception.
                 */
            if (pGeo.isCountry()) {
                pc.isCountry = true;
            }
            if (geocode) {
                pGeo.defaultHierarchicalPath();
                // Default score for geo will be calculated in PlaceCandidate
                pc.addPlace(pGeo);
            }
        }
        // When geocoding, drop candidates that ended up with no places attached.
        if (geocode && !pc.hasPlaces()) {
            log.debug("Place has no places={}", pc.getText());
            continue;
        } else {
            if (log.isDebugEnabled()) {
                log.debug("Text {} matched {}", pc.getText(), namesMatched);
            }
        }
        candidates.put(pc.start, pc);
    }
    // end of tag loop
    long t3 = System.currentTimeMillis();
    // this.tagNamesTime = (int)(t1 - t0);
    this.getNamesTime = (int) (t2 - t1);
    this.totalTime = (int) (t3 - t0);
    if (log.isDebugEnabled()) {
        summarizeExtraction(candidates.values(), docid);
    }
    // Running totals accumulate across calls; the per-call counters were reset at the top.
    this.filteredTotal += this.defaultFilterCount + this.userFilterCount;
    this.matchedTotal += candidates.size();
    return new ArrayList<PlaceCandidate>(candidates.values());
}
Also used : HashMap(java.util.HashMap) NamedList(org.apache.solr.common.util.NamedList) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) ArrayList(java.util.ArrayList) NamedList(org.apache.solr.common.util.NamedList) List(java.util.List) HashSet(java.util.HashSet)

Aggregations

QueryResponse (org.apache.solr.client.solrj.response.QueryResponse)285 SolrQuery (org.apache.solr.client.solrj.SolrQuery)123 Test (org.junit.Test)111 SolrDocument (org.apache.solr.common.SolrDocument)78 SolrInputDocument (org.apache.solr.common.SolrInputDocument)67 SolrDocumentList (org.apache.solr.common.SolrDocumentList)60 HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient)58 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)56 SolrServerException (org.apache.solr.client.solrj.SolrServerException)42 ArrayList (java.util.ArrayList)39 IOException (java.io.IOException)35 NamedList (org.apache.solr.common.util.NamedList)32 SolrClient (org.apache.solr.client.solrj.SolrClient)27 SolrParams (org.apache.solr.common.params.SolrParams)27 CloudSolrClient (org.apache.solr.client.solrj.impl.CloudSolrClient)26 ErrorTrackingConcurrentUpdateSolrClient (org.apache.solr.client.solrj.embedded.SolrExampleStreamingTest.ErrorTrackingConcurrentUpdateSolrClient)25 UpdateRequest (org.apache.solr.client.solrj.request.UpdateRequest)25 SolrQueryResponse (org.apache.solr.response.SolrQueryResponse)23 HashMap (java.util.HashMap)21 PivotField (org.apache.solr.client.solrj.response.PivotField)19