Usage of eu.etaxonomy.cdm.api.service.search.LuceneMultiSearch in the project cdmlib by cybertaxonomy.
Class TaxonServiceImpl, method findTaxaAndNamesByFullText.
/**
 * Runs a combined full text search for taxa and names.
 * <p>
 * Depending on {@code searchModes} up to four Lucene sub-searches are prepared
 * (taxa/synonyms, taxa by common names, misapplied names, pro parte synonyms),
 * bundled into one {@link LuceneMultiSearch} and executed as a single paged,
 * grouped search. If at least one named area is given, a distribution filter
 * built from join queries is set on the multi search.
 *
 * @param searchModes the sub-searches to run; also carries the
 *        {@code includeUnpublished} flag. When {@code null},
 *        {@code EnumSet.of(TaxaAndNamesSearchMode.doTaxa)} is used.
 * @param queryString the query string handed to the sub-search builders
 * @param classification handed to the sub-search builders
 * @param subtree handed to the sub-search builders
 * @param namedAreas areas for the distribution filter; the filter is only
 *        applied when this set is non-null and non-empty
 * @param distributionStatus handed, together with the areas, to the
 *        distribution query builders
 * @param languages handed to the sub-search builders
 * @param highlightFragments handed to the sub-search builders; a warning is
 *        logged when {@code true} since highlighting is not fully supported here
 * @param pageSize handed to the search execution and to the returned pager
 * @param pageNumber handed to the search execution and to the returned pager
 * @param orderHints converted into Lucene sort fields; when {@code null},
 *        {@code OrderHint.NOMENCLATURAL_SORT_ORDER} is used
 * @param propertyPaths handed to the search result builder
 * @return a pager over the search results; its total count is the total group
 *         count of the grouped result, or 0 when the result set is {@code null}
 * @throws LuceneParseException when executing the search raises a
 *         {@code ParseException}; message and stack trace are copied over
 * @throws IOException declared; presumably raised by index access - TODO confirm
 * @throws LuceneMultiSearchException declared; presumably raised by the multi
 *         search - TODO confirm
 */
@Override
public Pager<SearchResult<TaxonBase>> findTaxaAndNamesByFullText(EnumSet<TaxaAndNamesSearchMode> searchModes, String queryString, Classification classification, TaxonNode subtree, Set<NamedArea> namedAreas, Set<PresenceAbsenceTerm> distributionStatus, List<Language> languages, boolean highlightFragments, Integer pageSize, Integer pageNumber, List<OrderHint> orderHints, List<String> propertyPaths) throws IOException, LuceneParseException, LuceneMultiSearchException {
// NOTE (legacy remark): this requires building a special sort column by a special classBridge
if (highlightFragments) {
logger.warn("findTaxaAndNamesByFullText() : fragment highlighting is " + "currently not fully supported by this method and thus " + "may not work with common names and misapplied names.");
}
// convert sets to lists; both lists stay null when the corresponding set is null
List<NamedArea> namedAreaList = null;
List<PresenceAbsenceTerm> distributionStatusList = null;
if (namedAreas != null) {
namedAreaList = new ArrayList<>(namedAreas.size());
namedAreaList.addAll(namedAreas);
}
if (distributionStatus != null) {
distributionStatusList = new ArrayList<>(distributionStatus.size());
distributionStatusList.addAll(distributionStatus);
}
// set default if parameter is null
if (searchModes == null) {
searchModes = EnumSet.of(TaxaAndNamesSearchMode.doTaxa);
}
// fall back to the nomenclatural sort order when no order hints are given
if (orderHints == null) {
orderHints = OrderHint.NOMENCLATURAL_SORT_ORDER.asList();
}
// translate the order hints one-to-one into Lucene sort fields, keeping their order
SortField[] sortFields = new SortField[orderHints.size()];
int i = 0;
for (OrderHint oh : orderHints) {
sortFields[i++] = oh.toSortField();
}
// SortField[] sortFields = new SortField[]{SortField.FIELD_SCORE, new SortField("id", SortField.STRING, false)};
// SortField[] sortFields = new SortField[]{new SortField(NomenclaturalSortOrderBrigde.NAME_SORT_FIELD_NAME, SortField.STRING, false)};
// the distribution filter is only applied when at least one area is given
boolean addDistributionFilter = namedAreas != null && namedAreas.size() > 0;
// one LuceneSearch is collected here per enabled search mode
List<LuceneSearch> luceneSearches = new ArrayList<>();
// handed to the search result builder at the end of this method
Map<CdmBaseType, String> idFieldMap = new HashMap<>();
/*
======== filtering by distribution , HOWTO ========
- http://www.javaranch.com/journal/2009/02/filtering-a-lucene-search.html
- http://stackoverflow.com/questions/17709256/lucene-solr-using-complex-filters -> QueryWrapperFilter
add Filter to search as http://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/Filter.html
which will be put into a FilteredQuery in the end ?
3. how does it work in spatial?
see
- http://www.nsshutdown.com/projects/lucene/whitepaper/locallucene_v2.html
- http://www.infoq.com/articles/LuceneSpatialSupport
- http://www.mhaller.de/archives/156-Spatial-search-with-Lucene.html
------------------------------------------------------------------------
filter strategies:
A) use a separate distribution filter per index sub-query/search:
- byTaxonSynonym (query TaxonBase):
use a join area filter (Distribution -> TaxonBase)
- byCommonName (query DescriptionElementBase): use an area filter on
DescriptionElementBase !!! PROBLEM !!!
This cannot work since the distributions are different entities than the
common names and thus these are different lucene documents.
- byMisappliedNames (join query TaxonRelationship -> TaxonBase):
use a join area filter (Distribution -> TaxonBase)
B) use a common distribution filter for all index sub-query/searches:
- use a common join area filter (Distribution -> TaxonBase)
- also implement the byCommonName as join query (CommonName -> TaxonBase)
PROBLEM in this case: we are losing the fragment highlighting for the
common names, since the returned documents are always TaxonBases
*/
/* A dedicated QueryFactory is used for creating filter queries on Distributions.
 * The query factory used for the common names query cannot be reused
 * for this case, since we want to only record the text fields which are
 * actually used in the primary query
 */
QueryFactory distributionFilterQueryFactory = luceneIndexToolProvider.newQueryFactoryFor(Distribution.class);
// collects the area join queries of all sub-searches as SHOULD clauses;
// it is only built and set as filter when addDistributionFilter is true
Builder multiIndexByAreaFilterBuilder = new Builder();
boolean includeUnpublished = searchModes.contains(TaxaAndNamesSearchMode.includeUnpublished);
// search for taxa or synonyms
if (searchModes.contains(TaxaAndNamesSearchMode.doTaxa) || searchModes.contains(TaxaAndNamesSearchMode.doSynonyms)) {
// default: search all TaxonBase documents (taxa and synonyms)
@SuppressWarnings("rawtypes") Class<? extends TaxonBase> taxonBaseSubclass = TaxonBase.class;
String className = null;
if (searchModes.contains(TaxaAndNamesSearchMode.doTaxa) && !searchModes.contains(TaxaAndNamesSearchMode.doSynonyms)) {
// taxa only: narrow the searched class to Taxon
taxonBaseSubclass = Taxon.class;
} else if (!searchModes.contains(TaxaAndNamesSearchMode.doTaxa) && searchModes.contains(TaxaAndNamesSearchMode.doSynonyms)) {
// synonyms only: keep TaxonBase as class and pass the Synonym class name instead
className = "eu.etaxonomy.cdm.model.taxon.Synonym";
}
luceneSearches.add(prepareFindByFullTextSearch(taxonBaseSubclass, queryString, classification, subtree, className, includeUnpublished, languages, highlightFragments, sortFields));
idFieldMap.put(CdmBaseType.TAXON_BASE, "id");
/* A) does not work!!!!
if(addDistributionFilter){
// in this case we need a filter which uses a join query
// to get the TaxonBase documents for the DescriptionElementBase documents
// which are matching the areas in question
Query taxonAreaJoinQuery = createByDistributionJoinQuery(
namedAreaList,
distributionStatusList,
distributionFilterQueryFactory
);
multiIndexByAreaFilter.add(new QueryWrapperFilter(taxonAreaJoinQuery), Occur.SHOULD);
}
*/
if (addDistributionFilter && searchModes.contains(TaxaAndNamesSearchMode.doSynonyms)) {
// add additional area filter for synonyms
// in DescriptionElementBase index
String fromField = "inDescription.taxon.id";
// id in TaxonBase index
String toField = "accTaxon" + AcceptedTaxonBridge.DOC_KEY_ID_SUFFIX;
// TODO replace by createByDistributionJoinQuery
BooleanQuery byDistributionQuery = createByDistributionQuery(namedAreaList, distributionStatusList, distributionFilterQueryFactory);
Query taxonAreaJoinQuery = distributionFilterQueryFactory.newJoinQuery(Distribution.class, fromField, true, byDistributionQuery, toField, Taxon.class, ScoreMode.None);
multiIndexByAreaFilterBuilder.add(taxonAreaJoinQuery, Occur.SHOULD);
}
}
// search by CommonTaxonName
if (searchModes.contains(TaxaAndNamesSearchMode.doTaxaByCommonNames)) {
// B)
// the common name search is implemented as a join query from the
// CommonTaxonName documents ("inDescription.taxon.id") to the "id" field
QueryFactory descriptionElementQueryFactory = luceneIndexToolProvider.newQueryFactoryFor(DescriptionElementBase.class);
Query byCommonNameJoinQuery = descriptionElementQueryFactory.newJoinQuery(CommonTaxonName.class, "inDescription.taxon.id", true, QueryFactory.addTypeRestriction(createByDescriptionElementFullTextQuery(queryString, classification, subtree, null, languages, descriptionElementQueryFactory), CommonTaxonName.class).build(), "id", null, ScoreMode.Max);
if (logger.isDebugEnabled()) {
logger.debug("byCommonNameJoinQuery: " + byCommonNameJoinQuery.toString());
}
LuceneSearch byCommonNameSearch = new LuceneSearch(luceneIndexToolProvider, GroupByTaxonClassBridge.GROUPBY_TAXON_FIELD, Taxon.class);
byCommonNameSearch.setCdmTypRestriction(Taxon.class);
Builder builder = new BooleanQuery.Builder();
builder.add(byCommonNameJoinQuery, Occur.MUST);
if (!includeUnpublished) {
// additionally require publish == true unless unpublished taxa are requested
QueryFactory taxonBaseQueryFactory = luceneIndexToolProvider.newQueryFactoryFor(TaxonBase.class);
builder.add(taxonBaseQueryFactory.newBooleanQuery("publish", true), Occur.MUST);
}
byCommonNameSearch.setQuery(builder.build());
byCommonNameSearch.setSortFields(sortFields);
idFieldMap.put(CdmBaseType.TAXON_BASE, "id");
luceneSearches.add(byCommonNameSearch);
/* A) does not work!!!!
luceneSearches.add(
prepareByDescriptionElementFullTextSearch(CommonTaxonName.class,
queryString, classification, null, languages, highlightFragments)
);
idFieldMap.put(CdmBaseType.DESCRIPTION_ELEMENT, "inDescription.taxon.id");
if(addDistributionFilter){
// in this case we are able to use DescriptionElementBase documents
// which are matching the areas in question directly
BooleanQuery byDistributionQuery = createByDistributionQuery(
namedAreaList,
distributionStatusList,
distributionFilterQueryFactory
);
multiIndexByAreaFilter.add(new QueryWrapperFilter(byDistributionQuery), Occur.SHOULD);
} */
}
// TODO merge with pro parte synonym search once #7487 is fixed
if (searchModes.contains(TaxaAndNamesSearchMode.doMisappliedNames)) /*|| searchModes.contains(TaxaAndNamesSearchMode.doSynonyms) */
{
// NOTE:
// prepareFindByTaxonRelationFullTextSearch() is making use of JoinUtil.createJoinQuery()
// which allows doing query time joins
// finds the misapplied name (Taxon B) which is a misapplication for
// a related Taxon A.
//
Set<TaxonRelationshipType> relTypes = new HashSet<>();
// this inner check repeats the enclosing condition; it is a remainder of the
// commented-out doSynonyms alternative
if (searchModes.contains(TaxaAndNamesSearchMode.doMisappliedNames)) {
relTypes.addAll(TaxonRelationshipType.allMisappliedNameTypes());
}
// if (searchModes.contains(TaxaAndNamesSearchMode.doSynonyms)){
// relTypes.addAll(TaxonRelationshipType.allSynonymTypes());
// }
luceneSearches.add(prepareFindByTaxonRelationFullTextSearch(new TaxonRelationshipEdge(relTypes, Direction.relatedTo), queryString, classification, subtree, includeUnpublished, languages, highlightFragments, sortFields));
idFieldMap.put(CdmBaseType.TAXON_BASE, "id");
if (addDistributionFilter) {
// in DescriptionElementBase index
String fromField = "inDescription.taxon.id";
/*
 * Here I was facing a weird and nasty bug which took me hours to track down until I found this solution.
 * Maybe this is a bug in java itself.
 *
 * When the string toField is constructed by using the expression TaxonRelationshipType.MISAPPLIED_NAME_FOR().getUuid().toString()
 * directly:
 *
 * String toField = "relation." + TaxonRelationshipType.MISAPPLIED_NAME_FOR().getUuid().toString() +".to.id";
 *
 * The byDistributionQuery fails, however when the uuid is first stored in another string variable the query
 * will execute as expected:
 *
 * String misappliedNameForUuid = TaxonRelationshipType.MISAPPLIED_NAME_FOR().getUuid().toString();
 * String toField = "relation." + misappliedNameForUuid +".to.id";
 *
 * Comparing both strings by the String.equals method returns true, so both Strings are identical.
 *
 * The bug occurs when running eu.etaxonomy.cdm.api.service.TaxonServiceSearchTest in eclipse and in maven and seems to be
 * dependent on a specific jvm (openjdk6 6b27-1.12.6-1ubuntu0.13.04.2, openjdk7 7u25-2.3.10-1ubuntu0.13.04.2, oracle jdk1.7.0_25 tested)
 * The bug is persistent after a reboot of the development computer.
 */
// String misappliedNameForUuid = TaxonRelationshipType.MISAPPLIED_NAME_FOR().getUuid().toString();
// String toField = "relation." + misappliedNameForUuid +".to.id";
// the field name is hard-coded as a consequence of the problem described above
String toField = "relation.1ed87175-59dd-437e-959e-0d71583d8417.to.id";
// System.out.println("relation.1ed87175-59dd-437e-959e-0d71583d8417.to.id".equals("relation." + misappliedNameForUuid +".to.id") ? " > identical" : " > different");
// System.out.println("relation.1ed87175-59dd-437e-959e-0d71583d8417.to.id".equals("relation." + TaxonRelationshipType.MISAPPLIED_NAME_FOR().getUuid().toString() +".to.id") ? " > identical" : " > different");
// TODO replace by createByDistributionJoinQuery
BooleanQuery byDistributionQuery = createByDistributionQuery(namedAreaList, distributionStatusList, distributionFilterQueryFactory);
Query taxonAreaJoinQuery = distributionFilterQueryFactory.newJoinQuery(Distribution.class, fromField, true, byDistributionQuery, toField, null, ScoreMode.None);
// debug code for bug described above
// does not compile anymore since changing from lucene 3.6.2 to lucene 4.10+
// DocIdSet filterMatchSet = filter.getDocIdSet(luceneIndexToolProvider.getIndexReaderFor(Taxon.class));
// System.err.println(DocIdBitSetPrinter.docsAsString(filterMatchSet, 100));
multiIndexByAreaFilterBuilder.add(taxonAreaJoinQuery, Occur.SHOULD);
}
}
// search by pro parte synonyms
if (searchModes.contains(TaxaAndNamesSearchMode.doSynonyms)) {
// TODO merge with misapplied name search once #7487 is fixed
Set<TaxonRelationshipType> relTypes = new HashSet<>();
relTypes.addAll(TaxonRelationshipType.allSynonymTypes());
luceneSearches.add(prepareFindByTaxonRelationFullTextSearch(new TaxonRelationshipEdge(relTypes, Direction.relatedTo), queryString, classification, subtree, includeUnpublished, languages, highlightFragments, sortFields));
idFieldMap.put(CdmBaseType.TAXON_BASE, "id");
if (addDistributionFilter) {
// in DescriptionElementBase index
String fromField = "inDescription.taxon.id";
// NOTE(review): hard-coded uuid, presumably the one of a pro parte synonym
// relationship type - confirm against TaxonRelationshipType
String toField = "relation.8a896603-0fa3-44c6-9cd7-df2d8792e577.to.id";
BooleanQuery byDistributionQuery = createByDistributionQuery(namedAreaList, distributionStatusList, distributionFilterQueryFactory);
Query taxonAreaJoinQuery = distributionFilterQueryFactory.newJoinQuery(Distribution.class, fromField, true, byDistributionQuery, toField, null, ScoreMode.None);
multiIndexByAreaFilterBuilder.add(taxonAreaJoinQuery, Occur.SHOULD);
}
}
// end pro parte synonyms
// bundle all prepared sub-searches into one multi search
LuceneMultiSearch multiSearch = new LuceneMultiSearch(luceneIndexToolProvider, luceneSearches.toArray(new LuceneSearch[luceneSearches.size()]));
if (addDistributionFilter) {
// B)
// in this case we need a filter which uses a join query
// to get the TaxonBase documents for the DescriptionElementBase documents
// which are matching the areas in question
//
// for doTaxa, doByCommonName
Query taxonAreaJoinQuery = createByDistributionJoinQuery(namedAreaList, distributionStatusList, distributionFilterQueryFactory, Taxon.class, true);
multiIndexByAreaFilterBuilder.add(taxonAreaJoinQuery, Occur.SHOULD);
}
if (addDistributionFilter) {
// all collected area join queries (SHOULD clauses) act as one common filter
multiSearch.setFilter(multiIndexByAreaFilterBuilder.build());
}
// --- execute search
TopGroups<BytesRef> topDocsResultSet;
try {
topDocsResultSet = multiSearch.executeSearch(pageSize, pageNumber);
} catch (ParseException e) {
// rethrow as LuceneParseException, carrying over message and stack trace
LuceneParseException luceneParseException = new LuceneParseException(e.getMessage());
luceneParseException.setStackTrace(e.getStackTrace());
throw luceneParseException;
}
// --- initialize taxa, highlight matches ....
ISearchResultBuilder searchResultBuilder = new SearchResultBuilder(multiSearch, multiSearch.getQuery());
List<SearchResult<TaxonBase>> searchResults = searchResultBuilder.createResultSet(topDocsResultSet, multiSearch.getHighlightFields(), dao, idFieldMap, propertyPaths);
// the pager's total count is the number of groups, 0 when there is no result set
long totalHits = (topDocsResultSet != null) ? Long.valueOf(topDocsResultSet.totalGroupCount) : 0;
return new DefaultPagerImpl<>(pageNumber, totalHits, pageSize, searchResults);
}
Usage of eu.etaxonomy.cdm.api.service.search.LuceneMultiSearch in the project cdmlib by cybertaxonomy.
Class TaxonServiceImpl, method findByEverythingFullText.
/**
 * Runs a full text search over description elements and taxon bases at once.
 * <p>
 * Two sub-searches are prepared, one via
 * {@code prepareByDescriptionElementFullTextSearch} and one via
 * {@code prepareFindByFullTextSearch}, and executed together as a single
 * {@link LuceneMultiSearch}. No sort fields are passed to the taxon base
 * search, and {@code orderHints} is not read by this method.
 *
 * @param queryString the query string handed to both sub-searches
 * @param classification handed to both sub-searches
 * @param subtree handed to both sub-searches
 * @param includeUnpublished handed to the taxon base sub-search
 * @param languages handed to both sub-searches
 * @param highlightFragments handed to both sub-searches
 * @param pageSize handed to the search execution and to the returned pager
 * @param pageNumber handed to the search execution and to the returned pager
 * @param orderHints currently unused
 * @param propertyPaths handed to the search result builder
 * @return a pager over the search results; its total count is the total group
 *         count of the grouped result, or 0 when the result set is {@code null}
 * @throws LuceneParseException when executing the search raises a
 *         {@code ParseException}; message and stack trace are copied over
 * @throws IOException declared; presumably raised by index access - TODO confirm
 * @throws LuceneMultiSearchException declared; presumably raised by the multi
 *         search - TODO confirm
 */
@Override
public Pager<SearchResult<TaxonBase>> findByEverythingFullText(String queryString, Classification classification, TaxonNode subtree, boolean includeUnpublished, List<Language> languages, boolean highlightFragments, Integer pageSize, Integer pageNumber, List<OrderHint> orderHints, List<String> propertyPaths) throws IOException, LuceneParseException, LuceneMultiSearchException {

    // prepare the two sub-searches and combine them
    LuceneSearch descriptionElementSearch = prepareByDescriptionElementFullTextSearch(null, queryString, classification, subtree, null, languages, highlightFragments);
    LuceneSearch taxonBaseSearch = prepareFindByFullTextSearch(null, queryString, classification, subtree, null, includeUnpublished, languages, highlightFragments, null);
    LuceneMultiSearch combinedSearch = new LuceneMultiSearch(luceneIndexToolProvider, descriptionElementSearch, taxonBaseSearch);

    // run the search; a query parser failure is reported as LuceneParseException
    TopGroups<BytesRef> groupedHits;
    try {
        groupedHits = combinedSearch.executeSearch(pageSize, pageNumber);
    } catch (ParseException parseFailure) {
        LuceneParseException translated = new LuceneParseException(parseFailure.getMessage());
        translated.setStackTrace(parseFailure.getStackTrace());
        throw translated;
    }

    // turn the grouped hits into search results (initialize taxa, highlight matches)
    ISearchResultBuilder resultBuilder = new SearchResultBuilder(combinedSearch, combinedSearch.getQuery());
    Map<CdmBaseType, String> idFields = new HashMap<>();
    idFields.put(CdmBaseType.TAXON_BASE, "id");
    idFields.put(CdmBaseType.DESCRIPTION_ELEMENT, "inDescription.taxon.id");
    List<SearchResult<TaxonBase>> results = resultBuilder.createResultSet(groupedHits, combinedSearch.getHighlightFields(), dao, idFields, propertyPaths);

    // the pager reports the number of groups, not the number of single documents
    int groupCount;
    if (groupedHits == null) {
        groupCount = 0;
    } else {
        groupCount = groupedHits.totalGroupCount;
    }
    return new DefaultPagerImpl<>(pageNumber, Long.valueOf(groupCount), pageSize, results);
}
Aggregations