Use of org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField in project searchcode-server by boyter.
The class CodeIndexer, method buildDocument.
/**
 * Builds a document ready to be indexed by Lucene
 */
public Document buildDocument(CodeIndexDocument codeIndexDocument) {
    Document document = new Document();
    // Path is the primary key for documents
    // needs to include repo location, project name and then filepath including file
    Field pathField = new StringField("path", codeIndexDocument.getRepoLocationRepoNameLocationFilename(), Field.Store.YES);
    document.add(pathField);

    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getLanguageName())) {
        document.add(new SortedSetDocValuesFacetField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName()));
    }
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getRepoName())) {
        document.add(new SortedSetDocValuesFacetField(Values.REPONAME, codeIndexDocument.getRepoName()));
    }
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getCodeOwner())) {
        document.add(new SortedSetDocValuesFacetField(Values.CODEOWNER, codeIndexDocument.getCodeOwner()));
    }

    this.searchcodeLib.addToSpellingCorrector(codeIndexDocument.getContents());

    StringBuilder indexContents = new StringBuilder();
    indexContents.append(this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getFileName())).append(" ");
    indexContents.append(this.searchcodeLib.splitKeywords(codeIndexDocument.getFileName())).append(" ");
    indexContents.append(codeIndexDocument.getFileLocationFilename()).append(" ");
    indexContents.append(codeIndexDocument.getFileLocation());
    indexContents.append(this.searchcodeLib.splitKeywords(codeIndexDocument.getContents()));
    indexContents.append(this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getContents()));
    indexContents.append(this.searchcodeLib.findInterestingKeywords(codeIndexDocument.getContents()));
    indexContents.append(this.searchcodeLib.findInterestingCharacters(codeIndexDocument.getContents()));

    document.add(new TextField(Values.REPONAME, codeIndexDocument.getRepoName().replace(" ", "_"), Field.Store.YES));
    document.add(new TextField(Values.FILENAME, codeIndexDocument.getFileName(), Field.Store.YES));
    document.add(new TextField(Values.FILELOCATION, codeIndexDocument.getFileLocation(), Field.Store.YES));
    document.add(new TextField(Values.FILELOCATIONFILENAME, codeIndexDocument.getFileLocationFilename(), Field.Store.YES));
    document.add(new TextField(Values.MD5HASH, codeIndexDocument.getMd5hash(), Field.Store.YES));
    document.add(new TextField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName().replace(" ", "_"), Field.Store.YES));
    document.add(new IntField(Values.CODELINES, codeIndexDocument.getCodeLines(), Field.Store.YES));
    document.add(new TextField(Values.CONTENTS, indexContents.toString().toLowerCase(), Field.Store.NO));
    document.add(new TextField(Values.REPOLOCATION, codeIndexDocument.getRepoRemoteLocation(), Field.Store.YES));
    document.add(new TextField(Values.CODEOWNER, codeIndexDocument.getCodeOwner().replace(" ", "_"), Field.Store.YES));
    document.add(new TextField(Values.CODEID, codeIndexDocument.getHash(), Field.Store.YES));

    // Extra metadata in this case when it was last indexed
    document.add(new LongField(Values.MODIFIED, new Date().getTime(), Field.Store.YES));

    return document;
}
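A minimal usage sketch, not taken from the project source: the CodeIndexer instance, helper method name and index location below are assumptions for illustration. Because buildDocument adds SortedSetDocValuesFacetField entries, the document is passed through FacetsConfig.build before indexing, and updateDocument on the "path" term replaces any previous entry for the same file.

public void indexSingleDocument(CodeIndexer codeIndexer, CodeIndexDocument codeIndexDocument) throws IOException {
    // Hypothetical index location; the real project reads its locations from Properties
    Directory dir = FSDirectory.open(Paths.get("./index"));
    IndexWriterConfig iwc = new IndexWriterConfig(new CodeAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    FacetsConfig facetsConfig = new FacetsConfig();
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
        Document doc = codeIndexer.buildDocument(codeIndexDocument);
        // "path" is the primary key, so updateDocument keeps the index free of duplicates;
        // facetsConfig.build translates the sorted-set facet fields into indexable form
        writer.updateDocument(new Term("path", codeIndexDocument.getRepoLocationRepoNameLocationFilename()),
                facetsConfig.build(doc));
    }
}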
Use of org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField in project searchcode-server by boyter.
The class CodeIndexer, method indexTimeDocuments.
/**
 * Given a queue of documents to index, index them by popping the queue limited to 1000 items.
 * This method must be synchronized as we have not added any logic to deal with multiple threads writing to the index.
 * TODO investigate how Lucene deals with multiple writes
 */
public synchronized void indexTimeDocuments(Queue<CodeIndexDocument> codeIndexDocumentQueue) throws IOException {
    // Index all documents and commit at the end for performance gains
    Directory dir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXLOCATION, Values.DEFAULTTIMEINDEXLOCATION)));
    Directory facetsdir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXFACETLOCATION, Values.DEFAULTTIMEINDEXFACETLOCATION)));

    Analyzer analyzer = new CodeAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    FacetsConfig facetsConfig;
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    IndexWriter writer = new IndexWriter(dir, iwc);
    TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(facetsdir);

    try {
        CodeIndexDocument codeIndexDocument = codeIndexDocumentQueue.poll();
        int count = 0;

        while (codeIndexDocument != null) {
            Singleton.getLogger().info("Indexing time file " + codeIndexDocument.getRepoLocationRepoNameLocationFilename());
            this.sharedService.decrementCodeIndexLinesCount(codeIndexDocument.getCodeLines());

            Document doc = new Document();
            // Path is the primary key for documents
            // needs to include repo location, project name and then filepath including file and revision
            Field pathField = new StringField("path", codeIndexDocument.getRepoLocationRepoNameLocationFilename() + ":" + codeIndexDocument.getRevision(), Field.Store.YES);
            doc.add(pathField);

            // Add in facets
            facetsConfig = new FacetsConfig();
            facetsConfig.setIndexFieldName(Values.LANGUAGENAME, Values.LANGUAGENAME);
            facetsConfig.setIndexFieldName(Values.REPONAME, Values.REPONAME);
            facetsConfig.setIndexFieldName(Values.CODEOWNER, Values.CODEOWNER);
            facetsConfig.setIndexFieldName(Values.DATEYEARMONTHDAY, Values.DATEYEARMONTHDAY);
            facetsConfig.setIndexFieldName(Values.DATEYEARMONTH, Values.DATEYEARMONTH);
            facetsConfig.setIndexFieldName(Values.DATEYEAR, Values.DATEYEAR);
            facetsConfig.setIndexFieldName(Values.REVISION, Values.REVISION);
            facetsConfig.setIndexFieldName(Values.DELETED, Values.DELETED);

            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getLanguageName()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getRepoName()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.REPONAME, codeIndexDocument.getRepoName()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getCodeOwner()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.CODEOWNER, codeIndexDocument.getCodeOwner()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getYearMonthDay()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTHDAY, codeIndexDocument.getYearMonthDay()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getYearMonthDay()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTH, codeIndexDocument.getYearMonthDay().substring(0, 6)));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getYearMonthDay()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.DATEYEAR, codeIndexDocument.getYearMonthDay().substring(0, 4)));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getRevision()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.REVISION, codeIndexDocument.getRevision()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.isDeleted()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.DELETED, codeIndexDocument.isDeleted()));
            }

            String indexContents = Values.EMPTYSTRING;
            indexContents += this.searchcodeLib.splitKeywords(codeIndexDocument.getContents());
            indexContents += this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getContents());
            // Store in spelling corrector
            this.searchcodeLib.addToSpellingCorrector(codeIndexDocument.getContents());
            indexContents = indexContents.toLowerCase();

            doc.add(new TextField(Values.REPONAME, codeIndexDocument.getRepoName(), Field.Store.YES));
            doc.add(new TextField(Values.FILENAME, codeIndexDocument.getFileName(), Field.Store.YES));
            doc.add(new TextField(Values.FILELOCATION, codeIndexDocument.getFileLocation(), Field.Store.YES));
            doc.add(new TextField(Values.FILELOCATIONFILENAME, codeIndexDocument.getFileLocationFilename(), Field.Store.YES));
            doc.add(new TextField(Values.MD5HASH, codeIndexDocument.getMd5hash(), Field.Store.YES));
            doc.add(new TextField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName(), Field.Store.YES));
            doc.add(new IntField(Values.CODELINES, codeIndexDocument.getCodeLines(), Field.Store.YES));
            doc.add(new TextField(Values.CONTENTS, indexContents, Field.Store.NO));
            doc.add(new TextField(Values.REPOLOCATION, codeIndexDocument.getRepoRemoteLocation(), Field.Store.YES));
            doc.add(new TextField(Values.CODEOWNER, codeIndexDocument.getCodeOwner(), Field.Store.YES));
            doc.add(new TextField(Values.REVISION, codeIndexDocument.getRevision(), Field.Store.YES));
            doc.add(new TextField(Values.DATEYEARMONTHDAY, codeIndexDocument.getYearMonthDay(), Field.Store.YES));
            doc.add(new TextField(Values.DATEYEARMONTH, codeIndexDocument.getYearMonth(), Field.Store.YES));
            doc.add(new TextField(Values.DATEYEAR, codeIndexDocument.getYear(), Field.Store.YES));
            doc.add(new TextField(Values.MESSAGE, codeIndexDocument.getMessage(), Field.Store.YES));
            doc.add(new TextField(Values.DELETED, codeIndexDocument.isDeleted(), Field.Store.YES));

            // Extra metadata in this case when it was last indexed
            doc.add(new LongField(Values.MODIFIED, new Date().getTime(), Field.Store.YES));

            writer.updateDocument(new Term(Values.PATH, codeIndexDocument.getRepoLocationRepoNameLocationFilename()), facetsConfig.build(taxoWriter, doc));

            count++;
            if (count >= INDEX_QUEUE_BATCH_SIZE) {
                codeIndexDocument = null;
            } else {
                codeIndexDocument = codeIndexDocumentQueue.poll();
            }
        }
    } finally {
        Singleton.getLogger().info("Closing writers");
        writer.close();
        taxoWriter.close();
    }
}
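A hedged sketch of how this method might be driven; the queue wiring and the caller below are assumptions for illustration, not project code. Each call indexes at most INDEX_QUEUE_BATCH_SIZE documents and commits by closing its writers, so a caller can loop until the shared queue is drained.

public void drainTimeIndexQueue(CodeIndexer codeIndexer) throws IOException {
    // Hypothetical producer/consumer wiring; a repository crawler is assumed to fill the queue
    Queue<CodeIndexDocument> queue = new ConcurrentLinkedQueue<>();
    // ... producer adds CodeIndexDocument instances here ...
    while (!queue.isEmpty()) {
        // Each call pops up to INDEX_QUEUE_BATCH_SIZE documents, then closes and commits the writers
        codeIndexer.indexTimeDocuments(queue);
    }
}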
Use of org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField in project lucene-solr by apache.
The class FacetsConfig, method build.
/**
 * Translates any added {@link FacetField}s into normal fields for indexing.
 *
 * <p>
 * <b>NOTE:</b> you should add the returned document to IndexWriter, not the input one!
 */
public Document build(TaxonomyWriter taxoWriter, Document doc) throws IOException {
    // Find all FacetFields, collated by the actual field:
    Map<String, List<FacetField>> byField = new HashMap<>();
    // ... and also all SortedSetDocValuesFacetFields:
    Map<String, List<SortedSetDocValuesFacetField>> dvByField = new HashMap<>();
    // ... and also all AssociationFacetFields
    Map<String, List<AssociationFacetField>> assocByField = new HashMap<>();
    Set<String> seenDims = new HashSet<>();

    for (IndexableField field : doc) {
        if (field.fieldType() == FacetField.TYPE) {
            FacetField facetField = (FacetField) field;
            FacetsConfig.DimConfig dimConfig = getDimConfig(facetField.dim);
            if (dimConfig.multiValued == false) {
                checkSeen(seenDims, facetField.dim);
            }
            String indexFieldName = dimConfig.indexFieldName;
            List<FacetField> fields = byField.get(indexFieldName);
            if (fields == null) {
                fields = new ArrayList<>();
                byField.put(indexFieldName, fields);
            }
            fields.add(facetField);
        }

        if (field.fieldType() == SortedSetDocValuesFacetField.TYPE) {
            SortedSetDocValuesFacetField facetField = (SortedSetDocValuesFacetField) field;
            FacetsConfig.DimConfig dimConfig = getDimConfig(facetField.dim);
            if (dimConfig.multiValued == false) {
                checkSeen(seenDims, facetField.dim);
            }
            String indexFieldName = dimConfig.indexFieldName;
            List<SortedSetDocValuesFacetField> fields = dvByField.get(indexFieldName);
            if (fields == null) {
                fields = new ArrayList<>();
                dvByField.put(indexFieldName, fields);
            }
            fields.add(facetField);
        }

        if (field.fieldType() == AssociationFacetField.TYPE) {
            AssociationFacetField facetField = (AssociationFacetField) field;
            FacetsConfig.DimConfig dimConfig = getDimConfig(facetField.dim);
            if (dimConfig.multiValued == false) {
                checkSeen(seenDims, facetField.dim);
            }
            if (dimConfig.hierarchical) {
                throw new IllegalArgumentException("AssociationFacetField cannot be hierarchical (dim=\"" + facetField.dim + "\")");
            }
            if (dimConfig.requireDimCount) {
                throw new IllegalArgumentException("AssociationFacetField cannot requireDimCount (dim=\"" + facetField.dim + "\")");
            }
            String indexFieldName = dimConfig.indexFieldName;
            List<AssociationFacetField> fields = assocByField.get(indexFieldName);
            if (fields == null) {
                fields = new ArrayList<>();
                assocByField.put(indexFieldName, fields);
            }
            fields.add(facetField);

            // Best effort: detect mis-matched types in same
            // indexed field:
            String type;
            if (facetField instanceof IntAssociationFacetField) {
                type = "int";
            } else if (facetField instanceof FloatAssociationFacetField) {
                type = "float";
            } else {
                type = "bytes";
            }
            // NOTE: not thread safe, but this is just best effort:
            String curType = assocDimTypes.get(indexFieldName);
            if (curType == null) {
                assocDimTypes.put(indexFieldName, type);
            } else if (!curType.equals(type)) {
                throw new IllegalArgumentException("mixing incompatible types of AssocationFacetField (" + curType + " and " + type + ") in indexed field \"" + indexFieldName + "\"; use FacetsConfig to change the indexFieldName for each dimension");
            }
        }
    }

    Document result = new Document();
    processFacetFields(taxoWriter, byField, result);
    processSSDVFacetFields(dvByField, result);
    processAssocFacetFields(taxoWriter, assocByField, result);

    for (IndexableField field : doc.getFields()) {
        IndexableFieldType ft = field.fieldType();
        if (ft != FacetField.TYPE && ft != SortedSetDocValuesFacetField.TYPE && ft != AssociationFacetField.TYPE) {
            result.add(field);
        }
    }

    return result;
}
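A short usage sketch of the standard pattern this method supports; the field names, facet dimensions and directory paths below are illustrative only. Facet fields are added to an ordinary Document, and the translated document returned by build, not the original, is what gets indexed.

FacetsConfig config = new FacetsConfig();
Document doc = new Document();
doc.add(new TextField("title", "an example document", Field.Store.YES));
doc.add(new FacetField("Author", "Jane Doe"));
doc.add(new FacetField("Publish Year", "2012"));

try (Directory indexDir = FSDirectory.open(Paths.get("./index"));
     Directory taxoDir = FSDirectory.open(Paths.get("./taxonomy"));
     IndexWriter writer = new IndexWriter(indexDir, new IndexWriterConfig(new StandardAnalyzer()));
     TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir)) {
    // Index the translated document rather than the original one
    writer.addDocument(config.build(taxoWriter, doc));
}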