Search in sources :

Example 6 with SortedSetDocValuesFacetField

use of org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField in project searchcode-server by boyter.

the class CodeIndexer method buildDocument.

/**
     * Builds a document ready to be indexed by lucene
     */
public Document buildDocument(CodeIndexDocument codeIndexDocument) {
    Document document = new Document();
    // Path is the primary key for documents
    // needs to include repo location, project name and then filepath including file
    Field pathField = new StringField("path", codeIndexDocument.getRepoLocationRepoNameLocationFilename(), Field.Store.YES);
    document.add(pathField);
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getLanguageName())) {
        document.add(new SortedSetDocValuesFacetField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName()));
    }
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getRepoName())) {
        document.add(new SortedSetDocValuesFacetField(Values.REPONAME, codeIndexDocument.getRepoName()));
    }
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getCodeOwner())) {
        document.add(new SortedSetDocValuesFacetField(Values.CODEOWNER, codeIndexDocument.getCodeOwner()));
    }
    this.searchcodeLib.addToSpellingCorrector(codeIndexDocument.getContents());
    StringBuilder indexContents = new StringBuilder();
    indexContents.append(this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getFileName())).append(" ");
    indexContents.append(this.searchcodeLib.splitKeywords(codeIndexDocument.getFileName())).append(" ");
    indexContents.append(codeIndexDocument.getFileLocationFilename()).append(" ");
    indexContents.append(codeIndexDocument.getFileLocation());
    indexContents.append(this.searchcodeLib.splitKeywords(codeIndexDocument.getContents()));
    indexContents.append(this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getContents()));
    indexContents.append(this.searchcodeLib.findInterestingKeywords(codeIndexDocument.getContents()));
    indexContents.append(this.searchcodeLib.findInterestingCharacters(codeIndexDocument.getContents()));
    document.add(new TextField(Values.REPONAME, codeIndexDocument.getRepoName().replace(" ", "_"), Field.Store.YES));
    document.add(new TextField(Values.FILENAME, codeIndexDocument.getFileName(), Field.Store.YES));
    document.add(new TextField(Values.FILELOCATION, codeIndexDocument.getFileLocation(), Field.Store.YES));
    document.add(new TextField(Values.FILELOCATIONFILENAME, codeIndexDocument.getFileLocationFilename(), Field.Store.YES));
    document.add(new TextField(Values.MD5HASH, codeIndexDocument.getMd5hash(), Field.Store.YES));
    document.add(new TextField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName().replace(" ", "_"), Field.Store.YES));
    document.add(new IntField(Values.CODELINES, codeIndexDocument.getCodeLines(), Field.Store.YES));
    document.add(new TextField(Values.CONTENTS, indexContents.toString().toLowerCase(), Field.Store.NO));
    document.add(new TextField(Values.REPOLOCATION, codeIndexDocument.getRepoRemoteLocation(), Field.Store.YES));
    document.add(new TextField(Values.CODEOWNER, codeIndexDocument.getCodeOwner().replace(" ", "_"), Field.Store.YES));
    document.add(new TextField(Values.CODEID, codeIndexDocument.getHash(), Field.Store.YES));
    // Extra metadata in this case when it was last indexed
    document.add(new LongField(Values.MODIFIED, new Date().getTime(), Field.Store.YES));
    return document;
}
Also used : SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument) SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) Date(java.util.Date)

Example 7 with SortedSetDocValuesFacetField

use of org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField in project searchcode-server by boyter.

the class CodeIndexer method indexTimeDocuments.

/**
     * Given a queue of documents to index, index them by popping the queue limited to 1000 items.
     * This method must be synchronized as we have not added any logic to deal with multiple threads writing to the
     * index.
     * TODO investigate how Lucene deals with multiple writes
     */
public synchronized void indexTimeDocuments(Queue<CodeIndexDocument> codeIndexDocumentQueue) throws IOException {
    // Index all documents and commit at the end for performance gains
    Directory dir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXLOCATION, Values.DEFAULTTIMEINDEXLOCATION)));
    Directory facetsdir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXFACETLOCATION, Values.DEFAULTTIMEINDEXFACETLOCATION)));
    Analyzer analyzer = new CodeAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    FacetsConfig facetsConfig;
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = new IndexWriter(dir, iwc);
    TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(facetsdir);
    try {
        CodeIndexDocument codeIndexDocument = codeIndexDocumentQueue.poll();
        int count = 0;
        while (codeIndexDocument != null) {
            Singleton.getLogger().info("Indexing time file " + codeIndexDocument.getRepoLocationRepoNameLocationFilename());
            this.sharedService.decrementCodeIndexLinesCount(codeIndexDocument.getCodeLines());
            Document doc = new Document();
            // Path is the primary key for documents
            // needs to include repo location, project name and then filepath including file and revision
            Field pathField = new StringField("path", codeIndexDocument.getRepoLocationRepoNameLocationFilename() + ":" + codeIndexDocument.getRevision(), Field.Store.YES);
            doc.add(pathField);
            // Add in facets
            facetsConfig = new FacetsConfig();
            facetsConfig.setIndexFieldName(Values.LANGUAGENAME, Values.LANGUAGENAME);
            facetsConfig.setIndexFieldName(Values.REPONAME, Values.REPONAME);
            facetsConfig.setIndexFieldName(Values.CODEOWNER, Values.CODEOWNER);
            facetsConfig.setIndexFieldName(Values.DATEYEARMONTHDAY, Values.DATEYEARMONTHDAY);
            facetsConfig.setIndexFieldName(Values.DATEYEARMONTH, Values.DATEYEARMONTH);
            facetsConfig.setIndexFieldName(Values.DATEYEAR, Values.DATEYEAR);
            facetsConfig.setIndexFieldName(Values.REVISION, Values.REVISION);
            facetsConfig.setIndexFieldName(Values.DELETED, Values.DELETED);
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getLanguageName()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getRepoName()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.REPONAME, codeIndexDocument.getRepoName()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getCodeOwner()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.CODEOWNER, codeIndexDocument.getCodeOwner()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getYearMonthDay()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTHDAY, codeIndexDocument.getYearMonthDay()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getYearMonthDay()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTH, codeIndexDocument.getYearMonthDay().substring(0, 6)));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getYearMonthDay()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.DATEYEAR, codeIndexDocument.getYearMonthDay().substring(0, 4)));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getRevision()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.REVISION, codeIndexDocument.getRevision()));
            }
            if (Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.isDeleted()) == false) {
                doc.add(new SortedSetDocValuesFacetField(Values.DELETED, codeIndexDocument.isDeleted()));
            }
            String indexContents = Values.EMPTYSTRING;
            indexContents += this.searchcodeLib.splitKeywords(codeIndexDocument.getContents());
            indexContents += this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getContents());
            // Store in spelling corrector
            this.searchcodeLib.addToSpellingCorrector(codeIndexDocument.getContents());
            indexContents = indexContents.toLowerCase();
            doc.add(new TextField(Values.REPONAME, codeIndexDocument.getRepoName(), Field.Store.YES));
            doc.add(new TextField(Values.FILENAME, codeIndexDocument.getFileName(), Field.Store.YES));
            doc.add(new TextField(Values.FILELOCATION, codeIndexDocument.getFileLocation(), Field.Store.YES));
            doc.add(new TextField(Values.FILELOCATIONFILENAME, codeIndexDocument.getFileLocationFilename(), Field.Store.YES));
            doc.add(new TextField(Values.MD5HASH, codeIndexDocument.getMd5hash(), Field.Store.YES));
            doc.add(new TextField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName(), Field.Store.YES));
            doc.add(new IntField(Values.CODELINES, codeIndexDocument.getCodeLines(), Field.Store.YES));
            doc.add(new TextField(Values.CONTENTS, indexContents, Field.Store.NO));
            doc.add(new TextField(Values.REPOLOCATION, codeIndexDocument.getRepoRemoteLocation(), Field.Store.YES));
            doc.add(new TextField(Values.CODEOWNER, codeIndexDocument.getCodeOwner(), Field.Store.YES));
            doc.add(new TextField(Values.REVISION, codeIndexDocument.getRevision(), Field.Store.YES));
            doc.add(new TextField(Values.DATEYEARMONTHDAY, codeIndexDocument.getYearMonthDay(), Field.Store.YES));
            doc.add(new TextField(Values.DATEYEARMONTH, codeIndexDocument.getYearMonth(), Field.Store.YES));
            doc.add(new TextField(Values.DATEYEAR, codeIndexDocument.getYear(), Field.Store.YES));
            doc.add(new TextField(Values.MESSAGE, codeIndexDocument.getMessage(), Field.Store.YES));
            doc.add(new TextField(Values.DELETED, codeIndexDocument.isDeleted(), Field.Store.YES));
            // Extra metadata in this case when it was last indexed
            doc.add(new LongField(Values.MODIFIED, new Date().getTime(), Field.Store.YES));
            writer.updateDocument(new Term(Values.PATH, codeIndexDocument.getRepoLocationRepoNameLocationFilename()), facetsConfig.build(taxoWriter, doc));
            count++;
            if (count >= INDEX_QUEUE_BATCH_SIZE) {
                codeIndexDocument = null;
            } else {
                codeIndexDocument = codeIndexDocumentQueue.poll();
            }
        }
    } finally {
        Singleton.getLogger().info("Closing writers");
        writer.close();
        taxoWriter.close();
    }
}
Also used : FacetsConfig(org.apache.lucene.facet.FacetsConfig) Term(org.apache.lucene.index.Term) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument) Date(java.util.Date) DirectoryTaxonomyWriter(org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) TaxonomyWriter(org.apache.lucene.facet.taxonomy.TaxonomyWriter) DirectoryTaxonomyWriter(org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter) SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) IndexWriter(org.apache.lucene.index.IndexWriter) CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument) SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 8 with SortedSetDocValuesFacetField

use of org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField in project lucene-solr by apache.

the class FacetsConfig method build.

/**
   * Translates any added {@link FacetField}s into normal fields for indexing.
   * 
   * <p>
   * <b>NOTE:</b> you should add the returned document to IndexWriter, not the
   * input one!
   */
public Document build(TaxonomyWriter taxoWriter, Document doc) throws IOException {
    // Find all FacetFields, collated by the actual field:
    Map<String, List<FacetField>> byField = new HashMap<>();
    // ... and also all SortedSetDocValuesFacetFields:
    Map<String, List<SortedSetDocValuesFacetField>> dvByField = new HashMap<>();
    // ... and also all AssociationFacetFields
    Map<String, List<AssociationFacetField>> assocByField = new HashMap<>();
    Set<String> seenDims = new HashSet<>();
    for (IndexableField field : doc) {
        if (field.fieldType() == FacetField.TYPE) {
            FacetField facetField = (FacetField) field;
            FacetsConfig.DimConfig dimConfig = getDimConfig(facetField.dim);
            if (dimConfig.multiValued == false) {
                checkSeen(seenDims, facetField.dim);
            }
            String indexFieldName = dimConfig.indexFieldName;
            List<FacetField> fields = byField.get(indexFieldName);
            if (fields == null) {
                fields = new ArrayList<>();
                byField.put(indexFieldName, fields);
            }
            fields.add(facetField);
        }
        if (field.fieldType() == SortedSetDocValuesFacetField.TYPE) {
            SortedSetDocValuesFacetField facetField = (SortedSetDocValuesFacetField) field;
            FacetsConfig.DimConfig dimConfig = getDimConfig(facetField.dim);
            if (dimConfig.multiValued == false) {
                checkSeen(seenDims, facetField.dim);
            }
            String indexFieldName = dimConfig.indexFieldName;
            List<SortedSetDocValuesFacetField> fields = dvByField.get(indexFieldName);
            if (fields == null) {
                fields = new ArrayList<>();
                dvByField.put(indexFieldName, fields);
            }
            fields.add(facetField);
        }
        if (field.fieldType() == AssociationFacetField.TYPE) {
            AssociationFacetField facetField = (AssociationFacetField) field;
            FacetsConfig.DimConfig dimConfig = getDimConfig(facetField.dim);
            if (dimConfig.multiValued == false) {
                checkSeen(seenDims, facetField.dim);
            }
            if (dimConfig.hierarchical) {
                throw new IllegalArgumentException("AssociationFacetField cannot be hierarchical (dim=\"" + facetField.dim + "\")");
            }
            if (dimConfig.requireDimCount) {
                throw new IllegalArgumentException("AssociationFacetField cannot requireDimCount (dim=\"" + facetField.dim + "\")");
            }
            String indexFieldName = dimConfig.indexFieldName;
            List<AssociationFacetField> fields = assocByField.get(indexFieldName);
            if (fields == null) {
                fields = new ArrayList<>();
                assocByField.put(indexFieldName, fields);
            }
            fields.add(facetField);
            // Best effort: detect mis-matched types in same
            // indexed field:
            String type;
            if (facetField instanceof IntAssociationFacetField) {
                type = "int";
            } else if (facetField instanceof FloatAssociationFacetField) {
                type = "float";
            } else {
                type = "bytes";
            }
            // NOTE: not thread safe, but this is just best effort:
            String curType = assocDimTypes.get(indexFieldName);
            if (curType == null) {
                assocDimTypes.put(indexFieldName, type);
            } else if (!curType.equals(type)) {
                throw new IllegalArgumentException("mixing incompatible types of AssocationFacetField (" + curType + " and " + type + ") in indexed field \"" + indexFieldName + "\"; use FacetsConfig to change the indexFieldName for each dimension");
            }
        }
    }
    Document result = new Document();
    processFacetFields(taxoWriter, byField, result);
    processSSDVFacetFields(dvByField, result);
    processAssocFacetFields(taxoWriter, assocByField, result);
    for (IndexableField field : doc.getFields()) {
        IndexableFieldType ft = field.fieldType();
        if (ft != FacetField.TYPE && ft != SortedSetDocValuesFacetField.TYPE && ft != AssociationFacetField.TYPE) {
            result.add(field);
        }
    }
    return result;
}
Also used : HashMap(java.util.HashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) FloatAssociationFacetField(org.apache.lucene.facet.taxonomy.FloatAssociationFacetField) FloatAssociationFacetField(org.apache.lucene.facet.taxonomy.FloatAssociationFacetField) AssociationFacetField(org.apache.lucene.facet.taxonomy.AssociationFacetField) IntAssociationFacetField(org.apache.lucene.facet.taxonomy.IntAssociationFacetField) SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) FloatAssociationFacetField(org.apache.lucene.facet.taxonomy.FloatAssociationFacetField) AssociationFacetField(org.apache.lucene.facet.taxonomy.AssociationFacetField) IntAssociationFacetField(org.apache.lucene.facet.taxonomy.IntAssociationFacetField) Document(org.apache.lucene.document.Document) IndexableField(org.apache.lucene.index.IndexableField) IndexableFieldType(org.apache.lucene.index.IndexableFieldType) ArrayList(java.util.ArrayList) List(java.util.List) SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) IntAssociationFacetField(org.apache.lucene.facet.taxonomy.IntAssociationFacetField) HashSet(java.util.HashSet)

Aggregations

SortedSetDocValuesFacetField (org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField)8 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 Document (org.apache.lucene.document.Document)3 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)3 BytesRef (org.apache.lucene.util.BytesRef)3 CodeIndexDocument (com.searchcode.app.dto.CodeIndexDocument)2 Date (java.util.Date)2 HashSet (java.util.HashSet)2 List (java.util.List)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2 StringField (org.apache.lucene.document.StringField)2 DirectoryTaxonomyWriter (org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter)2 IndexWriter (org.apache.lucene.index.IndexWriter)2 Term (org.apache.lucene.index.Term)2 Directory (org.apache.lucene.store.Directory)2 CodeAnalyzer (com.searchcode.app.util.CodeAnalyzer)1 IOException (java.io.IOException)1 Map (java.util.Map)1 Analyzer (org.apache.lucene.analysis.Analyzer)1