use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.
the class CodeIndexer method indexTimeDocuments.
/**
 * Indexes documents polled from the supplied queue into the time index, stopping when the queue is
 * empty or once INDEX_QUEUE_BATCH_SIZE documents have been written by this call.
 * All documents are written through one writer which is closed at the end for performance gains.
 * This method must be synchronized as we have not added any logic to deal with multiple threads writing to the
 * index.
 * TODO investigate how Lucene deals with multiple writes
 *
 * @param codeIndexDocumentQueue queue the documents are polled (and therefore removed) from
 * @throws IOException if the index or facet directory cannot be opened, written to or closed
 */
public synchronized void indexTimeDocuments(Queue<CodeIndexDocument> codeIndexDocumentQueue) throws IOException {
    Analyzer analyzer = new CodeAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    // The facet configuration does not vary between documents so it is built once up front
    FacetsConfig facetsConfig = this.createTimeFacetsConfig();

    // try-with-resources guarantees that every directory and writer which was successfully opened is
    // closed, even when opening a later resource or closing an earlier one throws
    try (Directory dir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXLOCATION, Values.DEFAULTTIMEINDEXLOCATION)));
         Directory facetsdir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.TIMEINDEXFACETLOCATION, Values.DEFAULTTIMEINDEXFACETLOCATION)));
         IndexWriter writer = new IndexWriter(dir, iwc);
         TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(facetsdir)) {
        try {
            int count = 0;
            CodeIndexDocument codeIndexDocument = codeIndexDocumentQueue.poll();

            while (codeIndexDocument != null) {
                Singleton.getLogger().info("Indexing time file " + codeIndexDocument.getRepoLocationRepoNameLocationFilename());
                this.sharedService.decrementCodeIndexLinesCount(codeIndexDocument.getCodeLines());

                // Path is the primary key for documents
                // needs to include repo location, project name and then filepath including file and revision
                String primaryKey = codeIndexDocument.getRepoLocationRepoNameLocationFilename() + ":" + codeIndexDocument.getRevision();
                Document doc = this.buildTimeDocument(codeIndexDocument, primaryKey);

                // The update term has to be the exact value stored in the path field, revision included,
                // otherwise it never matches an existing document and every call adds a duplicate
                writer.updateDocument(new Term(Values.PATH, primaryKey), facetsConfig.build(taxoWriter, doc));

                count++;
                // Stop once the batch limit is reached, leaving the remainder on the queue
                codeIndexDocument = count >= INDEX_QUEUE_BATCH_SIZE ? null : codeIndexDocumentQueue.poll();
            }
        } finally {
            Singleton.getLogger().info("Closing writers");
        }
    }
}

/**
 * Creates the facet configuration used for every document written to the time index.
 */
private FacetsConfig createTimeFacetsConfig() {
    FacetsConfig facetsConfig = new FacetsConfig();
    facetsConfig.setIndexFieldName(Values.LANGUAGENAME, Values.LANGUAGENAME);
    facetsConfig.setIndexFieldName(Values.REPONAME, Values.REPONAME);
    facetsConfig.setIndexFieldName(Values.CODEOWNER, Values.CODEOWNER);
    facetsConfig.setIndexFieldName(Values.DATEYEARMONTHDAY, Values.DATEYEARMONTHDAY);
    facetsConfig.setIndexFieldName(Values.DATEYEARMONTH, Values.DATEYEARMONTH);
    facetsConfig.setIndexFieldName(Values.DATEYEAR, Values.DATEYEAR);
    facetsConfig.setIndexFieldName(Values.REVISION, Values.REVISION);
    facetsConfig.setIndexFieldName(Values.DELETED, Values.DELETED);
    return facetsConfig;
}

/**
 * Builds the Lucene document for a single time index entry: the primary key, the facets and the
 * searchable/stored fields. Also feeds the document contents to the spelling corrector.
 */
private Document buildTimeDocument(CodeIndexDocument codeIndexDocument, String primaryKey) {
    Document doc = new Document();
    // Same field name as the term used to update the document so the two always agree
    doc.add(new StringField(Values.PATH, primaryKey, Field.Store.YES));

    // Add in facets
    this.addTimeFacet(doc, Values.LANGUAGENAME, codeIndexDocument.getLanguageName());
    this.addTimeFacet(doc, Values.REPONAME, codeIndexDocument.getRepoName());
    this.addTimeFacet(doc, Values.CODEOWNER, codeIndexDocument.getCodeOwner());

    String yearMonthDay = codeIndexDocument.getYearMonthDay();
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(yearMonthDay)) {
        // NOTE(review): the substring offsets presume a yyyyMMdd style value - confirm against the producer
        doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTHDAY, yearMonthDay));
        doc.add(new SortedSetDocValuesFacetField(Values.DATEYEARMONTH, yearMonthDay.substring(0, 6)));
        doc.add(new SortedSetDocValuesFacetField(Values.DATEYEAR, yearMonthDay.substring(0, 4)));
    }

    this.addTimeFacet(doc, Values.REVISION, codeIndexDocument.getRevision());
    this.addTimeFacet(doc, Values.DELETED, codeIndexDocument.isDeleted());

    String indexContents = Values.EMPTYSTRING;
    indexContents += this.searchcodeLib.splitKeywords(codeIndexDocument.getContents());
    indexContents += this.searchcodeLib.codeCleanPipeline(codeIndexDocument.getContents());
    // Store in spelling corrector
    this.searchcodeLib.addToSpellingCorrector(codeIndexDocument.getContents());
    indexContents = indexContents.toLowerCase();

    doc.add(new TextField(Values.REPONAME, codeIndexDocument.getRepoName(), Field.Store.YES));
    doc.add(new TextField(Values.FILENAME, codeIndexDocument.getFileName(), Field.Store.YES));
    doc.add(new TextField(Values.FILELOCATION, codeIndexDocument.getFileLocation(), Field.Store.YES));
    doc.add(new TextField(Values.FILELOCATIONFILENAME, codeIndexDocument.getFileLocationFilename(), Field.Store.YES));
    doc.add(new TextField(Values.MD5HASH, codeIndexDocument.getMd5hash(), Field.Store.YES));
    doc.add(new TextField(Values.LANGUAGENAME, codeIndexDocument.getLanguageName(), Field.Store.YES));
    doc.add(new IntField(Values.CODELINES, codeIndexDocument.getCodeLines(), Field.Store.YES));
    // Contents are searchable but not stored
    doc.add(new TextField(Values.CONTENTS, indexContents, Field.Store.NO));
    doc.add(new TextField(Values.REPOLOCATION, codeIndexDocument.getRepoRemoteLocation(), Field.Store.YES));
    doc.add(new TextField(Values.CODEOWNER, codeIndexDocument.getCodeOwner(), Field.Store.YES));
    doc.add(new TextField(Values.REVISION, codeIndexDocument.getRevision(), Field.Store.YES));
    doc.add(new TextField(Values.DATEYEARMONTHDAY, codeIndexDocument.getYearMonthDay(), Field.Store.YES));
    doc.add(new TextField(Values.DATEYEARMONTH, codeIndexDocument.getYearMonth(), Field.Store.YES));
    doc.add(new TextField(Values.DATEYEAR, codeIndexDocument.getYear(), Field.Store.YES));
    doc.add(new TextField(Values.MESSAGE, codeIndexDocument.getMessage(), Field.Store.YES));
    doc.add(new TextField(Values.DELETED, codeIndexDocument.isDeleted(), Field.Store.YES));
    // Extra metadata in this case when it was last indexed
    doc.add(new LongField(Values.MODIFIED, System.currentTimeMillis(), Field.Store.YES));
    return doc;
}

/**
 * Adds a facet for the given dimension to the document unless the value is null, empty or whitespace.
 */
private void addTimeFacet(Document doc, String dimension, String value) {
    if (!Singleton.getHelpers().isNullEmptyOrWhitespace(value)) {
        doc.add(new SortedSetDocValuesFacetField(dimension, value));
    }
}
use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.
the class TimeCodeSearcher method getRepoDocuments.
/**
 * Looks up every document in the index whose repository name field matches the given name and
 * returns the stored file location + filename value of each match.
 *
 * @param repoName name of the repository to look up; escaped before being used in the query
 * @return the stored {@code Values.FILELOCATIONFILENAME} value of each matching document, or whatever
 *         was collected so far (possibly nothing) if the lookup failed; failures are logged, not thrown
 */
public List<String> getRepoDocuments(String repoName) {
    List<String> fileLocations = new ArrayList<>();

    // try-with-resources closes the reader and directory even when parsing or searching throws
    try (FSDirectory directory = FSDirectory.open(Paths.get(this.INDEXPATH));
         IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);

        // Escape the name so query syntax characters in it cannot alter or break the query
        Query query = parser.parse(Values.REPONAME + ":" + QueryParser.escape(repoName));

        TopDocs results = searcher.search(query, Integer.MAX_VALUE);
        for (ScoreDoc hit : results.scoreDocs) {
            Document doc = searcher.doc(hit.doc);
            fileLocations.add(doc.get(Values.FILELOCATIONFILENAME));
        }
    } catch (Exception ex) {
        LOGGER.severe(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }

    return fileLocations;
}
use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.
the class TimeCodeSearcher method getByRepoFileName.
/**
 * Attempts to find a unique file given the repository name and the path/filename however
 * it seems to randomly not find things for some files. No idea of the root cause at this point and have implemented
 * a work around where we get the file by getById which is not ideal. The bug appears to be due to some issue
 * inside lucene itself as using raw queries to pull back the file results in no matches, and yet it does appear
 * when not limiting to the repo
 * TODO investigate the lucene issue that occurs here mentioned above
 * TODO needs to use the revision number here as well to get the right value
 *
 * @param repo     repository name, used as the first segment of the file location searched for
 * @param fileName path/filename inside the repository
 * @return the first matching result with its file lines loaded (no lines if the file could not be
 *         read), or null when nothing matched or the lookup failed; failures are logged, not thrown
 */
public CodeResult getByRepoFileName(String repo, String fileName) {
    CodeResult codeResult = null;

    // try-with-resources closes the reader and directory even when parsing or searching throws
    try (FSDirectory directory = FSDirectory.open(Paths.get(this.INDEXPATH));
         IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);

        // TODO I have a feeling this may not be unique if there are two files in the same directory with different case... something to investigate
        String queryString = Values.FILELOCATIONFILENAME + ":" + QueryParser.escape(repo + "/" + fileName);
        Query query = parser.parse(queryString);
        Singleton.getLogger().info("Query to get by filename = " + queryString);

        TopDocs results = searcher.search(query, 1);
        ScoreDoc[] hits = results.scoreDocs;

        if (hits.length != 0) {
            Document doc = searcher.doc(hits[0].doc);
            String filepath = doc.get(Values.PATH);

            List<String> code = new ArrayList<>();
            try {
                // Read once through the encoding-guessing reader. A preceding strict UTF-8 read of the
                // whole file had its result discarded and stopped this call from running at all for
                // files that are not valid UTF-8
                code = Singleton.getHelpers().readFileLinesGuessEncoding(filepath, Singleton.getHelpers().tryParseInt(Properties.getProperties().getProperty(Values.MAXFILELINEDEPTH, Values.DEFAULTMAXFILELINEDEPTH), Values.DEFAULTMAXFILELINEDEPTH));
            } catch (Exception ex) {
                Singleton.getLogger().info("Indexed file appears to binary: " + filepath);
            }

            codeResult = new CodeResult(code, null);
            codeResult.setCodePath(doc.get(Values.FILELOCATIONFILENAME));
            codeResult.setFileName(doc.get(Values.FILENAME));
            codeResult.setLanguageName(doc.get(Values.LANGUAGENAME));
            codeResult.setMd5hash(doc.get(Values.MD5HASH));
            codeResult.setCodeLines(doc.get(Values.CODELINES));
            codeResult.setDocumentId(hits[0].doc);
            codeResult.setRepoName(doc.get(Values.REPONAME));
            codeResult.setRepoLocation(doc.get(Values.REPOLOCATION));
            codeResult.setCodeOwner(doc.get(Values.CODEOWNER));
        }
    } catch (Exception ex) {
        LOGGER.severe(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }

    return codeResult;
}
use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.
the class TimeCodeSearcher method search.
/**
 * Given a query and what page of results we are on return the matching results for that search.
 * The search count statistic is incremented on every call, including calls that fail.
 *
 * @param queryString query in Lucene query parser syntax
 * @param page        page of results to return, passed through to the paging search
 * @return the results for the requested page, or an empty {@code SearchResult} if the query could not
 *         be parsed or the index could not be searched; failures are logged, not thrown
 */
public SearchResult search(String queryString, int page) {
    SearchResult searchResult = new SearchResult();
    statsService.incrementSearchCount();

    // try-with-resources closes the reader and directory even when parsing or searching throws
    try (FSDirectory directory = FSDirectory.open(Paths.get(this.INDEXPATH));
         IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);

        Query query = parser.parse(queryString);
        LOGGER.info("Searching for: " + query.toString(CODEFIELD));

        searchResult = this.doPagingSearch(reader, searcher, query, page);
    } catch (Exception ex) {
        // Still return an empty result so a bad query never breaks the caller, but leave a trace
        // instead of failing silently
        LOGGER.warning(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }

    return searchResult;
}
use of com.searchcode.app.util.CodeAnalyzer in project searchcode-server by boyter.
the class CodeIndexer method deleteByCodeId.
/**
 * Deletes a file from the index using the code id which seems to be
 * the most reliable way of doing it
 * TODO Update the record and set the facets to a value we can ignore
 *
 * @param codeId code id of the document(s) to delete; escaped before being used in the query
 * @throws IOException if the index directory or writer cannot be opened or closed; failures while
 *                     parsing the query or deleting are logged as warnings and not rethrown
 */
public synchronized void deleteByCodeId(String codeId) throws IOException {
    Analyzer analyzer = new CodeAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    // try-with-resources closes the directory as well as the writer, including when opening the
    // writer itself fails
    try (Directory dir = FSDirectory.open(Paths.get(Properties.getProperties().getProperty(Values.INDEXLOCATION, Values.DEFAULTINDEXLOCATION)));
         IndexWriter writer = new IndexWriter(dir, iwc)) {
        // Inner try keeps the original contract: only parse/delete failures are swallowed and logged,
        // open and close failures still propagate to the caller
        try {
            QueryParser parser = new QueryParser(Values.CONTENTS, analyzer);
            Query query = parser.parse(Values.CODEID + ":" + QueryParser.escape(codeId));
            writer.deleteDocuments(query);
        } catch (Exception ex) {
            Singleton.getLogger().warning("ERROR - caught a " + ex.getClass() + " in CodeIndexer\n with message: " + ex.getMessage());
        }
    }
}
Aggregations