Examples with CodeAnalyzer - com.searchcode.app.util.CodeAnalyzer

Example 1 with CodeAnalyzer

use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.

the class CodeIndexer method indexTimeDocuments.

/**
 * Indexes documents taken from the supplied queue into the time index, stopping once the queue is
 * empty or {@code INDEX_QUEUE_BATCH_SIZE} documents have been written in this call.
 * This method must be synchronized as we have not added any logic to deal with multiple threads
 * writing to the index.
 * TODO investigate how Lucene deals with multiple writes
 *
 * @param codeIndexDocumentQueue queue that documents are polled from; documents beyond the batch
 *                               limit are left in the queue
 * @throws IOException if a directory or writer cannot be opened, written to or closed
 */
public synchronized void indexTimeDocuments(Queue<CodeIndexDocument> codeIndexDocumentQueue) throws IOException {
    // The facet configuration is the same for every document, so build it once per batch
    // instead of once per document.
    FacetsConfig facetsConfig = new FacetsConfig();
    facetsConfig.setIndexFieldName(Values.LANGUAGENAME, Values.LANGUAGENAME);
    facetsConfig.setIndexFieldName(Values.REPONAME, Values.REPONAME);
    facetsConfig.setIndexFieldName(Values.CODEOWNER, Values.CODEOWNER);
    facetsConfig.setIndexFieldName(Values.DATEYEARMONTHDAY, Values.DATEYEARMONTHDAY);
    facetsConfig.setIndexFieldName(Values.DATEYEARMONTH, Values.DATEYEARMONTH);
    facetsConfig.setIndexFieldName(Values.DATEYEAR, Values.DATEYEAR);
    facetsConfig.setIndexFieldName(Values.REVISION, Values.REVISION);
    facetsConfig.setIndexFieldName(Values.DELETED, Values.DELETED);

    Analyzer analyzer = new CodeAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    // Index all documents and commit at the end for performance gains.
    // try-with-resources guarantees that every directory and writer that was opened is closed,
    // even when opening a later one fails or closing an earlier one throws. Resources are closed
    // in reverse declaration order: taxonomy writer, index writer, then both directories.
    try (Directory dir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXLOCATION, Values.DEFAULTTIMEINDEXLOCATION)));
         Directory facetsdir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXFACETLOCATION, Values.DEFAULTTIMEINDEXFACETLOCATION)));
         IndexWriter writer = new IndexWriter(dir, iwc);
         TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(facetsdir)) {
        try {
            int count = 0;
            CodeIndexDocument codeIndexDocument = codeIndexDocumentQueue.poll();
            while (codeIndexDocument != null) {
                Singleton.getLogger().info("Indexing time file " + codeIndexDocument.getRepoLocationRepoNameLocationFilename());
                this.sharedService.decrementCodeIndexLinesCount(codeIndexDocument.getCodeLines());

                Document doc = this.buildTimeDocument(codeIndexDocument);

                // NOTE(review): the stored "path" field appends ":" + revision but this term does not,
                // so if Values.PATH names that field the term cannot match previously indexed
                // documents and every call adds a new document - confirm whether that is intended.
                writer.updateDocument(new Term(Values.PATH, codeIndexDocument.getRepoLocationRepoNameLocationFilename()), facetsConfig.build(taxoWriter, doc));

                count++;
                // Stop polling once the batch limit is reached so the rest stays queued.
                codeIndexDocument = count >= INDEX_QUEUE_BATCH_SIZE ? null : codeIndexDocumentQueue.poll();
            }
        } finally {
            Singleton.getLogger().info("Closing writers");
        }
    }
}

/**
 * Builds the Lucene document (key, facets and text/numeric fields) for a single time-index entry.
 * Also feeds the document contents to the spelling corrector.
 */
private Document buildTimeDocument(CodeIndexDocument codeIndexDocument) {
    Document doc = new Document();

    // Path is the primary key for documents
    // needs to include repo location, project name and then filepath including file and revision
    doc.add(new StringField("path", codeIndexDocument.getRepoLocationRepoNameLocationFilename() + ":" + codeIndexDocument.getRevision(), Field.Store.YES));

    // Add in facets; blank values are skipped
    this.addFacetIfPresent(doc, Values.LANGUAGENAME, codeIndexDocument.getLanguageName());
    this.addFacetIfPresent(doc, Values.REPONAME, codeIndexDocument.getRepoName());
    this.addFacetIfPresent(doc, Values.CODEOWNER, codeIndexDocument.getCodeOwner());
    String yearMonthDay = codeIndexDocument.getYearMonthDay();
    if (Singleton.getHelpers().isNullEmptyOrWhitespace(yearMonthDay) == false) {
        // NOTE(review): the substring calls assume at least 6 characters (presumably yyyyMMdd);
        // a shorter non-blank value throws StringIndexOutOfBoundsException - confirm the format.
        doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTHDAY, yearMonthDay));
        doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTH, yearMonthDay.substring(0, 6)));
        doc.add(new SortedSetDocValuesFacetField(Values.DATEYEAR, yearMonthDay.substring(0, 4)));
    }
    this.addFacetIfPresent(doc, Values.REVISION, codeIndexDocument.getRevision());
    this.addFacetIfPresent(doc, Values.DELETED, codeIndexDocument.isDeleted());

    // Searchable contents are the split keywords followed by the cleaned code, lower-cased
    String indexContents = Values.EMPTYSTRING;
    indexContents += this.searchcodeLib.splitKeywords(codeIndexDocument.getContents());
    indexContents += this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getContents());
    // Store in spelling corrector
    this.searchcodeLib.addToSpellingCorrector(codeIndexDocument.getContents());
    indexContents = indexContents.toLowerCase();

    doc.add(new TextField(Values.REPONAME, codeIndexDocument.getRepoName(), Field.Store.YES));
    doc.add(new TextField(Values.FILENAME, codeIndexDocument.getFileName(), Field.Store.YES));
    doc.add(new TextField(Values.FILELOCATION, codeIndexDocument.getFileLocation(), Field.Store.YES));
    doc.add(new TextField(Values.FILELOCATIONFILENAME, codeIndexDocument.getFileLocationFilename(), Field.Store.YES));
    doc.add(new TextField(Values.MD5HASH, codeIndexDocument.getMd5hash(), Field.Store.YES));
    doc.add(new TextField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName(), Field.Store.YES));
    doc.add(new IntField(Values.CODELINES, codeIndexDocument.getCodeLines(), Field.Store.YES));
    // Contents are indexed for searching but not stored
    doc.add(new TextField(Values.CONTENTS, indexContents, Field.Store.NO));
    doc.add(new TextField(Values.REPOLOCATION, codeIndexDocument.getRepoRemoteLocation(), Field.Store.YES));
    doc.add(new TextField(Values.CODEOWNER, codeIndexDocument.getCodeOwner(), Field.Store.YES));
    doc.add(new TextField(Values.REVISION, codeIndexDocument.getRevision(), Field.Store.YES));
    doc.add(new TextField(Values.DATEYEARMONTHDAY, codeIndexDocument.getYearMonthDay(), Field.Store.YES));
    doc.add(new TextField(Values.DATEYEARMONTH, codeIndexDocument.getYearMonth(), Field.Store.YES));
    doc.add(new TextField(Values.DATEYEAR, codeIndexDocument.getYear(), Field.Store.YES));
    doc.add(new TextField(Values.MESSAGE, codeIndexDocument.getMessage(), Field.Store.YES));
    doc.add(new TextField(Values.DELETED, codeIndexDocument.isDeleted(), Field.Store.YES));
    // Extra metadata in this case when it was last indexed
    doc.add(new LongField(Values.MODIFIED, new Date().getTime(), Field.Store.YES));

    return doc;
}

/**
 * Adds a facet field for the given dimension unless the value is null, empty or whitespace.
 */
private void addFacetIfPresent(Document doc, String dimension, String value) {
    if (Singleton.getHelpers().isNullEmptyOrWhitespace(value) == false) {
        doc.add(new SortedSetDocValuesFacetField(dimension, value));
    }
}

Also used : FacetsConfig(org.apache.lucene.facet.FacetsConfig) Term(org.apache.lucene.index.Term) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument) Date(java.util.Date) DirectoryTaxonomyWriter(org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) TaxonomyWriter(org.apache.lucene.facet.taxonomy.TaxonomyWriter) DirectoryTaxonomyWriter(org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter) SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) IndexWriter(org.apache.lucene.index.IndexWriter) CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument) SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 2 with CodeAnalyzer

use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.

the class TimeCodeSearcher method getRepoDocuments.

/**
 * Returns the stored file location/filename value of every indexed document whose repository
 * name field matches the supplied name.
 *
 * @param repoName repository name to look up; it is escaped before being parsed so characters
 *                 that are special to the query syntax cannot break or alter the query
 * @return the matching file locations, or whatever was collected before a failure (possibly
 *         empty); failures are logged rather than thrown
 */
public List<String> getRepoDocuments(String repoName) {
    List<String> fileLocations = new ArrayList<>();

    // try-with-resources closes the reader and directory even when parsing or searching throws
    try (FSDirectory directory = FSDirectory.open(Paths.get(this.INDEXPATH));
         IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);
        Query query = parser.parse(Values.REPONAME + ":" + QueryParser.escape(repoName));

        TopDocs results = searcher.search(query, Integer.MAX_VALUE);
        for (ScoreDoc hit : results.scoreDocs) {
            Document doc = searcher.doc(hit.doc);
            fileLocations.add(doc.get(Values.FILELOCATIONFILENAME));
        }
    } catch (Exception ex) {
        LOGGER.severe(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }

    return fileLocations;
}

Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) ArrayList(java.util.ArrayList) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) IOException(java.io.IOException) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) IndexReader(org.apache.lucene.index.IndexReader)

Example 3 with CodeAnalyzer

use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.

the class TimeCodeSearcher method getByRepoFileName.

/**
 * Attempts to find a unique file given the repository name and the path/filename however
 * it seems to randomly not find things for some files. No idea of the root cause at this point and have implemented
 * a work around where we get the file by getById which is not ideal. The bug appears to be due to some issue
 * inside lucene itself as using raw queries to pull back the file results in no matches, and yet it does appear
 * when not limiting to the repo
 * TODO investigate the lucene issue that occurs here mentioned above
 * TODO needs to use the revision number here as well to get the right value
 *
 * @param repo     repository part of the location; joined to {@code fileName} with "/"
 * @param fileName path/filename inside the repository
 * @return the first matching result, or {@code null} when nothing matches or the lookup fails
 *         (failures are logged rather than thrown)
 */
public CodeResult getByRepoFileName(String repo, String fileName) {
    CodeResult codeResult = null;

    // try-with-resources closes the reader and directory even when the lookup throws
    try (FSDirectory directory = FSDirectory.open(Paths.get(this.INDEXPATH));
         IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);

        // TODO I have a feeling this may not be unique if there are two files in the same directory with different case... something to investigate
        String queryText = Values.FILELOCATIONFILENAME + ":" + QueryParser.escape(repo + "/" + fileName);
        Query query = parser.parse(queryText);
        Singleton.getLogger().info("Query to get by filename = " + queryText);

        TopDocs results = searcher.search(query, 1);
        ScoreDoc[] hits = results.scoreDocs;

        if (hits.length != 0) {
            Document doc = searcher.doc(hits[0].doc);
            String filepath = doc.get(Values.PATH);

            List<String> code = new ArrayList<>();
            try {
                // Read once through the encoding-guessing helper only. The previous extra strict
                // UTF-8 read was immediately overwritten and made non-UTF-8 files fail before
                // the helper was ever reached.
                code = Singleton.getHelpers().readFileLinesGuessEncoding(filepath, Singleton.getHelpers().tryParseInt(Properties.getProperties().getProperty(Values.MAXFILELINEDEPTH, Values.DEFAULTMAXFILELINEDEPTH), Values.DEFAULTMAXFILELINEDEPTH));
            } catch (Exception ex) {
                // Best effort: the result is still returned, just without any code lines
                Singleton.getLogger().info("Indexed file appears to binary: " + filepath);
            }

            codeResult = new CodeResult(code, null);
            codeResult.setCodePath(doc.get(Values.FILELOCATIONFILENAME));
            codeResult.setFileName(doc.get(Values.FILENAME));
            codeResult.setLanguageName(doc.get(Values.LANGUAGENAME));
            codeResult.setMd5hash(doc.get(Values.MD5HASH));
            codeResult.setCodeLines(doc.get(Values.CODELINES));
            codeResult.setDocumentId(hits[0].doc);
            codeResult.setRepoName(doc.get(Values.REPONAME));
            codeResult.setRepoLocation(doc.get(Values.REPOLOCATION));
            codeResult.setCodeOwner(doc.get(Values.CODEOWNER));
        }
    } catch (Exception ex) {
        LOGGER.severe(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }

    return codeResult;
}

Example 4 with CodeAnalyzer

use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.

the class TimeCodeSearcher method search.

/**
 * Given a query and what page of results we are on return the matching results for that search.
 * The search counter is incremented for every call, including ones that fail.
 *
 * @param queryString query text, parsed with the classic query parser against {@code CODEFIELD}
 * @param page        page of results to return; passed through to {@code doPagingSearch}
 * @return the results for the requested page, or an empty {@link SearchResult} when the query
 *         cannot be parsed or the index cannot be read (the failure is logged, not thrown)
 */
public SearchResult search(String queryString, int page) {
    SearchResult searchResult = new SearchResult();
    statsService.incrementSearchCount();

    // try-with-resources closes the reader and directory even when parsing or searching throws
    try (FSDirectory directory = FSDirectory.open(Paths.get(this.INDEXPATH));
         IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);
        Query query = parser.parse(queryString);
        LOGGER.info("Searching for: " + query.toString(CODEFIELD));

        searchResult = this.doPagingSearch(reader, searcher, query, page);
    } catch (Exception ex) {
        // Still return the empty result, but record the failure instead of swallowing it silently
        LOGGER.warning(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }

    return searchResult;
}

Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) Query(org.apache.lucene.search.Query) IndexReader(org.apache.lucene.index.IndexReader) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) IOException(java.io.IOException)

Example 5 with CodeAnalyzer

use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.

the class CodeIndexer method deleteByCodeId.

/**
 * Deletes a file from the index using the code id which seems to be
 * the most reliable way of doing it
 * TODO Update the record and set the facets to a value we can ignore
 *
 * @param codeId code id of the document(s) to delete; escaped before being parsed as a query
 * @throws IOException if the index directory or writer cannot be opened or closed; failures
 *                     while parsing the query or deleting are logged and not rethrown
 */
public synchronized void deleteByCodeId(String codeId) throws IOException {
    Analyzer analyzer = new CodeAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    // try-with-resources closes the writer and then the directory, which was previously never closed
    try (Directory dir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.INDEXLOCATION, Values.DEFAULTINDEXLOCATION)));
         IndexWriter writer = new IndexWriter(dir, iwc)) {
        // The catch is kept inside the resource block so that only parse/delete failures are
        // swallowed; open and close failures still propagate to the caller as before.
        try {
            QueryParser parser = new QueryParser(Values.CONTENTS, analyzer);
            Query query = parser.parse(Values.CODEID + ":" + QueryParser.escape(codeId));
            writer.deleteDocuments(query);
        } catch (Exception ex) {
            Singleton.getLogger().warning("ERROR - caught a " + ex.getClass() + " in CodeIndexer\n with message: " + ex.getMessage());
        }
    }
}

Also used : CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) Query(org.apache.lucene.search.Query) IndexWriter(org.apache.lucene.index.IndexWriter) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) IOException(java.io.IOException) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

CodeAnalyzer (com.searchcode.app.util.CodeAnalyzer): 7 · Analyzer (org.apache.lucene.analysis.Analyzer): 7 · IOException (java.io.IOException): 4 · IndexWriter (org.apache.lucene.index.IndexWriter): 4 · IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 4 · QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 4 · Query (org.apache.lucene.search.Query): 4 · Directory (org.apache.lucene.store.Directory): 4 · FSDirectory (org.apache.lucene.store.FSDirectory): 4 · IndexReader (org.apache.lucene.index.IndexReader): 3 · Term (org.apache.lucene.index.Term): 3 · IndexSearcher (org.apache.lucene.search.IndexSearcher): 3 · CodeIndexDocument (com.searchcode.app.dto.CodeIndexDocument): 2 · ArrayList (java.util.ArrayList): 2 · Document (org.apache.lucene.document.Document): 2 · FacetsConfig (org.apache.lucene.facet.FacetsConfig): 2 · TaxonomyWriter (org.apache.lucene.facet.taxonomy.TaxonomyWriter): 2 · DirectoryTaxonomyWriter (org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter): 2 · ScoreDoc (org.apache.lucene.search.ScoreDoc): 2 · TopDocs (org.apache.lucene.search.TopDocs): 2