Search in sources :

Example 1 with CodeIndexDocument

use of com.searchcode.app.dto.CodeIndexDocument in project searchcode-server by boyter.

the class CodeIndexer method buildDocument.

/**
 * Builds a document ready to be indexed by lucene.
 *
 * The document is keyed on the "path" field. Language, repository and owner facets are only
 * added when the corresponding value is non-blank. The searchable CONTENTS field is assembled
 * from the file name, file location and several transformations of the file contents, is
 * lower-cased, and is indexed without being stored.
 *
 * A null repository name, language name or code owner is stored as an empty string rather
 * than causing a NullPointerException, matching the null tolerance of the facet guards.
 *
 * @param codeIndexDocument the source file details and contents to convert
 * @return the populated Lucene document
 */
public Document buildDocument(CodeIndexDocument codeIndexDocument) {
    Document document = new Document();
    // Path is the primary key for documents
    // needs to include repo location, project name and then filepath including file
    Field pathField = new StringField("path", codeIndexDocument.getRepoLocationRepoNameLocationFilename(), Field.Store.YES);
    document.add(pathField);

    String languageName = codeIndexDocument.getLanguageName();
    String repoName = codeIndexDocument.getRepoName();
    String codeOwner = codeIndexDocument.getCodeOwner();

    // Facets are optional; skip any whose value is missing
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(languageName)) {
        document.add(new SortedSetDocValuesFacetField(Values.LANGUAGENAME, languageName));
    }
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(repoName)) {
        document.add(new SortedSetDocValuesFacetField(Values.REPONAME, repoName));
    }
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeOwner)) {
        document.add(new SortedSetDocValuesFacetField(Values.CODEOWNER, codeOwner));
    }

    this.searchcodeLib.addToSpellingCorrector(codeIndexDocument.getContents());

    StringBuilder indexContents = new StringBuilder();
    indexContents.append(this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getFileName())).append(" ");
    indexContents.append(this.searchcodeLib.splitKeywords(codeIndexDocument.getFileName())).append(" ");
    indexContents.append(codeIndexDocument.getFileLocationFilename()).append(" ");
    // NOTE(review): no separator is appended after the file location, so it may fuse with the
    // first keyword below unless splitKeywords emits leading whitespace - confirm.
    indexContents.append(codeIndexDocument.getFileLocation());
    indexContents.append(this.searchcodeLib.splitKeywords(codeIndexDocument.getContents()));
    indexContents.append(this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getContents()));
    indexContents.append(this.searchcodeLib.findInterestingKeywords(codeIndexDocument.getContents()));
    indexContents.append(this.searchcodeLib.findInterestingCharacters(codeIndexDocument.getContents()));

    // Spaces are replaced with underscores in these three values; the facet guards above show
    // they may legitimately be null, so fall back to an empty string instead of dereferencing null.
    String storedRepoName = repoName == null ? Values.EMPTYSTRING : repoName.replace(" ", "_");
    String storedLanguageName = languageName == null ? Values.EMPTYSTRING : languageName.replace(" ", "_");
    String storedCodeOwner = codeOwner == null ? Values.EMPTYSTRING : codeOwner.replace(" ", "_");

    document.add(new TextField(Values.REPONAME, storedRepoName, Field.Store.YES));
    document.add(new TextField(Values.FILENAME, codeIndexDocument.getFileName(), Field.Store.YES));
    document.add(new TextField(Values.FILELOCATION, codeIndexDocument.getFileLocation(), Field.Store.YES));
    document.add(new TextField(Values.FILELOCATIONFILENAME, codeIndexDocument.getFileLocationFilename(), Field.Store.YES));
    document.add(new TextField(Values.MD5HASH, codeIndexDocument.getMd5hash(), Field.Store.YES));
    document.add(new TextField(Values.LANGUAGENAME, storedLanguageName, Field.Store.YES));
    document.add(new IntField(Values.CODELINES, codeIndexDocument.getCodeLines(), Field.Store.YES));
    document.add(new TextField(Values.CONTENTS, indexContents.toString().toLowerCase(), Field.Store.NO));
    document.add(new TextField(Values.REPOLOCATION, codeIndexDocument.getRepoRemoteLocation(), Field.Store.YES));
    document.add(new TextField(Values.CODEOWNER, storedCodeOwner, Field.Store.YES));
    document.add(new TextField(Values.CODEID, codeIndexDocument.getHash(), Field.Store.YES));
    // Extra metadata in this case when it was last indexed
    document.add(new LongField(Values.MODIFIED, new Date().getTime(), Field.Store.YES));
    return document;
}
Also used : SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument) Date(java.util.Date)

Example 2 with CodeIndexDocument

use of com.searchcode.app.dto.CodeIndexDocument in project searchcode-server by boyter.

the class SearchcodeFileVisitor method visitFile.

/**
 * Visits a single file during the repository walk and, if it passes every filter, hands it to
 * the indexer (directly, or via the shared index queue).
 *
 * Returns TERMINATE when the job reports it should pause or terminate, or when the repository
 * is listed in the persistent delete collection. In every other case, including when an
 * exception is caught and logged, the walk continues.
 */
@Override
public FileVisitResult visitFile(Object file, BasicFileAttributes attrs) throws IOException {
    try {
        java.nio.file.Path path = (java.nio.file.Path) file;

        // Abort the whole walk if the job is stopping or the repository is queued for deletion
        if (this.indexBaseRepoJob.shouldJobPauseOrTerminate()
                || Singleton.getDataService().getPersistentDelete().contains(this.repoName)) {
            return FileVisitResult.TERMINATE;
        }

        // Normalise to unix style separators so that everything is easier to reason about
        String parentDirectory = FilenameUtils.separatorsToUnix(path.getParent().toString());
        String unixPath = FilenameUtils.separatorsToUnix(path.toString());
        String baseName = path.getFileName().toString();

        if (this.indexBaseRepoJob.ignoreFile(parentDirectory)) {
            return FileVisitResult.CONTINUE;
        }

        // The unix style path acts as the primary key of the file
        fileLocationsMap.put(unixPath, null);

        IndexBaseRepoJob.CodeLinesReturn linesResult = this.indexBaseRepoJob.getCodeLines(unixPath, reportList);
        if (linesResult.isError()) {
            fileLocationsMap.remove(unixPath);
            return FileVisitResult.CONTINUE;
        }

        // Minified and empty files are skipped, but their map entry is left in place
        boolean minified = this.indexBaseRepoJob.getIsMinified(linesResult.getCodeLines(), baseName, reportList).isMinified();
        if (minified || this.indexBaseRepoJob.checkIfEmpty(linesResult.getCodeLines(), baseName, reportList)) {
            return FileVisitResult.CONTINUE;
        }

        // Binary files are skipped and also dropped from the map
        if (this.indexBaseRepoJob.determineBinary(unixPath, baseName, linesResult.getCodeLines(), reportList)) {
            fileLocationsMap.remove(unixPath);
            return FileVisitResult.CONTINUE;
        }

        String fileHash = this.indexBaseRepoJob.getFileMd5(unixPath);
        String language = Singleton.getFileClassifier().languageGuesser(baseName, linesResult.getCodeLines());
        String relativeLocation = this.indexBaseRepoJob.getRelativeToProjectPath(file.toString(), unixPath);
        String locationWithFilename = this.indexBaseRepoJob.getFileLocationFilename(unixPath, fileRepoLocations);
        String blamePath = this.indexBaseRepoJob.getBlameFilePath(locationWithFilename);
        String owner = this.indexBaseRepoJob.getCodeOwner(linesResult.getCodeLines(), blamePath, this.repoName, fileRepoLocations, Singleton.getSearchCodeLib());

        int lineCount = linesResult.getCodeLines().size();
        CodeIndexDocument indexDocument = new CodeIndexDocument(unixPath, this.repoName, baseName, relativeLocation, locationWithFilename, fileHash, language, lineCount, StringUtils.join(linesResult.getCodeLines(), " "), repoRemoteLocation, owner);

        if (this.indexBaseRepoJob.LOWMEMORY) {
            // Index immediately rather than queueing
            Singleton.getCodeIndexer().indexDocument(indexDocument);
        } else {
            Singleton.getSharedService().incrementCodeIndexLinesCount(lineCount);
            Singleton.getCodeIndexQueue().add(indexDocument);
        }

        if (this.indexBaseRepoJob.LOGINDEXED) {
            reportList.add(new String[] { unixPath, "included", Values.EMPTYSTRING });
        }
    } catch (Exception ex) {
        // A failure on one file is logged and must not stop the rest of the walk
        Singleton.getLogger().warning("ERROR - caught a " + ex.getClass() + " in " + this.getClass() + " indexDocsByPath walkFileTree\n with message: " + ex.getMessage() + " for file " + file.toString() + " in path " + file + " in repo " + this.repoName);
    }
    // Continue at all costs
    return FileVisitResult.CONTINUE;
}
Also used : CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument) IOException(java.io.IOException)

Example 3 with CodeIndexDocument

use of com.searchcode.app.dto.CodeIndexDocument in project searchcode-server by boyter.

the class CodeIndexer method indexTimeDocuments.

/**
 * Given a queue of documents to index, index them by popping the queue, stopping once
 * INDEX_QUEUE_BATCH_SIZE documents have been processed or the queue is empty.
 * This method must be synchronized as we have not added any logic to deal with multiple threads
 * writing to the index.
 * TODO investigate how Lucene deals with multiple writes
 *
 * Each document is keyed on its path combined with its revision. The same key is used both for
 * the stored "path" field and for the update term, so re-indexing the same revision of a file
 * replaces the earlier entry instead of adding a duplicate.
 *
 * @param codeIndexDocumentQueue the queue to drain documents from
 * @throws IOException if the index directories or writers cannot be opened, written or closed
 */
public synchronized void indexTimeDocuments(Queue<CodeIndexDocument> codeIndexDocumentQueue) throws IOException {
    Analyzer analyzer = new CodeAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    // The facet configuration is the same for every document, so build it once
    FacetsConfig facetsConfig = new FacetsConfig();
    facetsConfig.setIndexFieldName(Values.LANGUAGENAME, Values.LANGUAGENAME);
    facetsConfig.setIndexFieldName(Values.REPONAME, Values.REPONAME);
    facetsConfig.setIndexFieldName(Values.CODEOWNER, Values.CODEOWNER);
    facetsConfig.setIndexFieldName(Values.DATEYEARMONTHDAY, Values.DATEYEARMONTHDAY);
    facetsConfig.setIndexFieldName(Values.DATEYEARMONTH, Values.DATEYEARMONTH);
    facetsConfig.setIndexFieldName(Values.DATEYEAR, Values.DATEYEAR);
    facetsConfig.setIndexFieldName(Values.REVISION, Values.REVISION);
    facetsConfig.setIndexFieldName(Values.DELETED, Values.DELETED);

    // Index all documents and commit at the end for performance gains.
    // try-with-resources closes in reverse declaration order: the index writer first, then the
    // taxonomy writer, then both directories - even if opening or closing one of them fails.
    try (Directory dir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXLOCATION, Values.DEFAULTTIMEINDEXLOCATION)));
         Directory facetsdir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXFACETLOCATION, Values.DEFAULTTIMEINDEXFACETLOCATION)));
         TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(facetsdir);
         IndexWriter writer = new IndexWriter(dir, iwc)) {
        try {
            CodeIndexDocument codeIndexDocument = codeIndexDocumentQueue.poll();
            int count = 0;
            while (codeIndexDocument != null) {
                Singleton.getLogger().info("Indexing time file " + codeIndexDocument.getRepoLocationRepoNameLocationFilename());
                this.sharedService.decrementCodeIndexLinesCount(codeIndexDocument.getCodeLines());

                Document doc = new Document();
                // Path is the primary key for documents
                // needs to include repo location, project name and then filepath including file and revision
                String primaryKey = codeIndexDocument.getRepoLocationRepoNameLocationFilename() + ":" + codeIndexDocument.getRevision();
                Field pathField = new StringField("path", primaryKey, Field.Store.YES);
                doc.add(pathField);

                // Add in facets, skipping any whose value is missing
                if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getLanguageName())) {
                    doc.add(new SortedSetDocValuesFacetField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName()));
                }
                if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getRepoName())) {
                    doc.add(new SortedSetDocValuesFacetField(Values.REPONAME, codeIndexDocument.getRepoName()));
                }
                if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getCodeOwner())) {
                    doc.add(new SortedSetDocValuesFacetField(Values.CODEOWNER, codeIndexDocument.getCodeOwner()));
                }
                String yearMonthDay = codeIndexDocument.getYearMonthDay();
                if (!Singleton.getHelpers().isNullEmptyOrWhitespace(yearMonthDay)) {
                    // The month and year facets are prefixes of the year-month-day value
                    doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTHDAY, yearMonthDay));
                    doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTH, yearMonthDay.substring(0, 6)));
                    doc.add(new SortedSetDocValuesFacetField(Values.DATEYEAR, yearMonthDay.substring(0, 4)));
                }
                if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.getRevision())) {
                    doc.add(new SortedSetDocValuesFacetField(Values.REVISION, codeIndexDocument.getRevision()));
                }
                if (!Singleton.getHelpers().isNullEmptyOrWhitespace(codeIndexDocument.isDeleted())) {
                    doc.add(new SortedSetDocValuesFacetField(Values.DELETED, codeIndexDocument.isDeleted()));
                }

                String indexContents = Values.EMPTYSTRING;
                indexContents += this.searchcodeLib.splitKeywords(codeIndexDocument.getContents());
                indexContents += this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getContents());
                // Store in spelling corrector
                this.searchcodeLib.addToSpellingCorrector(codeIndexDocument.getContents());
                indexContents = indexContents.toLowerCase();

                doc.add(new TextField(Values.REPONAME, codeIndexDocument.getRepoName(), Field.Store.YES));
                doc.add(new TextField(Values.FILENAME, codeIndexDocument.getFileName(), Field.Store.YES));
                doc.add(new TextField(Values.FILELOCATION, codeIndexDocument.getFileLocation(), Field.Store.YES));
                doc.add(new TextField(Values.FILELOCATIONFILENAME, codeIndexDocument.getFileLocationFilename(), Field.Store.YES));
                doc.add(new TextField(Values.MD5HASH, codeIndexDocument.getMd5hash(), Field.Store.YES));
                doc.add(new TextField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName(), Field.Store.YES));
                doc.add(new IntField(Values.CODELINES, codeIndexDocument.getCodeLines(), Field.Store.YES));
                doc.add(new TextField(Values.CONTENTS, indexContents, Field.Store.NO));
                doc.add(new TextField(Values.REPOLOCATION, codeIndexDocument.getRepoRemoteLocation(), Field.Store.YES));
                doc.add(new TextField(Values.CODEOWNER, codeIndexDocument.getCodeOwner(), Field.Store.YES));
                doc.add(new TextField(Values.REVISION, codeIndexDocument.getRevision(), Field.Store.YES));
                doc.add(new TextField(Values.DATEYEARMONTHDAY, codeIndexDocument.getYearMonthDay(), Field.Store.YES));
                doc.add(new TextField(Values.DATEYEARMONTH, codeIndexDocument.getYearMonth(), Field.Store.YES));
                doc.add(new TextField(Values.DATEYEAR, codeIndexDocument.getYear(), Field.Store.YES));
                doc.add(new TextField(Values.MESSAGE, codeIndexDocument.getMessage(), Field.Store.YES));
                doc.add(new TextField(Values.DELETED, codeIndexDocument.isDeleted(), Field.Store.YES));
                // Extra metadata in this case when it was last indexed
                doc.add(new LongField(Values.MODIFIED, new Date().getTime(), Field.Store.YES));

                // The update term must carry the same value as the stored path field, otherwise
                // it matches nothing and the update silently degrades into a plain add.
                writer.updateDocument(new Term(Values.PATH, primaryKey), facetsConfig.build(taxoWriter, doc));

                count++;
                // Stop after one batch; whatever remains in the queue is left for a later call
                codeIndexDocument = count >= INDEX_QUEUE_BATCH_SIZE ? null : codeIndexDocumentQueue.poll();
            }
        } finally {
            // Logged before the enclosing try-with-resources closes the writers
            Singleton.getLogger().info("Closing writers");
        }
    }
}
Also used : FacetsConfig(org.apache.lucene.facet.FacetsConfig) Term(org.apache.lucene.index.Term) CodeAnalyzer(com.searchcode.app.util.CodeAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument) Date(java.util.Date) DirectoryTaxonomyWriter(org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter) TaxonomyWriter(org.apache.lucene.facet.taxonomy.TaxonomyWriter) SortedSetDocValuesFacetField(org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField) IndexWriter(org.apache.lucene.index.IndexWriter) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 4 with CodeIndexDocument

use of com.searchcode.app.dto.CodeIndexDocument in project searchcode-server by boyter.

the class IndexGitHistoryJob method getRevisionChanges.

/**
 * Diffs the trees of two revisions and sends one time-index document per changed path to the
 * code indexer.
 *
 * Deleted entries are indexed from the old revision (old path, old contents, old metadata) and
 * flagged "TRUE"; every other change type is indexed from the new revision and flagged "FALSE".
 *
 * NOTE(review): the repository name "thumbor", the hash "md5hash" and the language "Java" are
 * hard-coded in the document constructor - presumably placeholders; confirm before relying on them.
 *
 * @param localRepository the repository to resolve trees and file contents from
 * @param git             the git handle used to compute the diff
 * @param oldRevison      the earlier change set
 * @param newRevision     the later change set
 * @throws IOException     if the revisions cannot be resolved or read
 * @throws GitAPIException if the diff command fails
 */
public void getRevisionChanges(Repository localRepository, Git git, GitChangeSet oldRevison, GitChangeSet newRevision) throws IOException, GitAPIException {
    ObjectId oldHead = localRepository.resolve(oldRevison.getRevision() + "^{tree}");
    ObjectId newHead = localRepository.resolve(newRevision.getRevision() + "^{tree}");
    ObjectReader reader = localRepository.newObjectReader();
    CanonicalTreeParser oldTreeIter = new CanonicalTreeParser();
    oldTreeIter.reset(reader, oldHead);
    CanonicalTreeParser newTreeIter = new CanonicalTreeParser();
    newTreeIter.reset(reader, newHead);
    List<DiffEntry> entries = git.diff().setNewTree(newTreeIter).setOldTree(oldTreeIter).call();
    GitService gs = new GitService();
    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
    for (DiffEntry entry : entries) {
        boolean deleted = entry.getChangeType() == DiffEntry.ChangeType.DELETE;
        // A deleted file only exists in the old revision; JGit reports /dev/null as its new
        // path, so both the path and the metadata must come from the old side.
        GitChangeSet changeSet = deleted ? oldRevison : newRevision;
        String path = deleted ? entry.getOldPath() : entry.getNewPath();

        System.out.println((deleted ? "DEL " : "ADD ") + path);
        String contents = gs.fetchFileRevision(localRepository.getWorkTree().toString() + "/.git", changeSet.getRevision(), path);
        CodeIndexDocument cd = new CodeIndexDocument(path, "thumbor", path, path, path, "md5hash", "Java", contents.split("\\r?\\n").length, contents, "", changeSet.getAuthor());
        cd.setRevision(changeSet.getRevision());
        // Date fields are derived from the same change set as the revision, author and message
        cd.setYearMonthDay(sdf.format(changeSet.getExpiry()));
        cd.setYearMonth(cd.getYearMonthDay().substring(0, 6));
        cd.setYear(cd.getYearMonthDay().substring(0, 4));
        cd.setMessage(changeSet.getMessage());
        cd.setDeleted(deleted ? "TRUE" : "FALSE");
        Singleton.getCodeIndexer().indexTimeDocument(cd);
    }
}
Also used : ObjectId(org.eclipse.jgit.lib.ObjectId) CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument) GitService(com.searchcode.app.service.GitService) ObjectReader(org.eclipse.jgit.lib.ObjectReader) SimpleDateFormat(java.text.SimpleDateFormat) CanonicalTreeParser(org.eclipse.jgit.treewalk.CanonicalTreeParser) DiffEntry(org.eclipse.jgit.diff.DiffEntry)

Example 5 with CodeIndexDocument

use of com.searchcode.app.dto.CodeIndexDocument in project searchcode-server by boyter.

the class CodeSearcherTest method testGetRepoDocuments.

// Integration Test
// Indexes one document under a repository name unique to this test, then checks that
// getRepoDocuments returns exactly one entry when called with 0 and none when called with 1.
public void testGetRepoDocuments() throws IOException {
    CodeIndexDocument placeholderDocument = new CodeIndexDocument("/", "testGetRepoDocuments", "/", "/", "/", "md5hash", "Java", 10, "", "/", "/");
    Singleton.getCodeIndexer().indexDocument(placeholderDocument);

    CodeSearcher searcher = new CodeSearcher();

    List<String> resultsForZero = searcher.getRepoDocuments("testGetRepoDocuments", 0);
    assertThat(resultsForZero).hasSize(1);

    List<String> resultsForOne = searcher.getRepoDocuments("testGetRepoDocuments", 1);
    assertThat(resultsForOne).hasSize(0);
}
Also used : CodeIndexDocument(com.searchcode.app.dto.CodeIndexDocument)

Aggregations

CodeIndexDocument (com.searchcode.app.dto.CodeIndexDocument)10 IOException (java.io.IOException)3 CodeAnalyzer (com.searchcode.app.util.CodeAnalyzer)2 Date (java.util.Date)2 Analyzer (org.apache.lucene.analysis.Analyzer)2 FacetsConfig (org.apache.lucene.facet.FacetsConfig)2 SortedSetDocValuesFacetField (org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField)2 TaxonomyWriter (org.apache.lucene.facet.taxonomy.TaxonomyWriter)2 DirectoryTaxonomyWriter (org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter)2 IndexWriter (org.apache.lucene.index.IndexWriter)2 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)2 Term (org.apache.lucene.index.Term)2 Directory (org.apache.lucene.store.Directory)2 FSDirectory (org.apache.lucene.store.FSDirectory)2 ProjectStats (com.searchcode.app.dto.ProjectStats)1 GitService (com.searchcode.app.service.GitService)1 SearchcodeLib (com.searchcode.app.util.SearchcodeLib)1 SimpleDateFormat (java.text.SimpleDateFormat)1 Queue (java.util.Queue)1 Document (org.apache.lucene.document.Document)1