Use of org.apache.lucene.store.FSDirectory in the languagetool project by languagetool-org.
Method findSimilarWords of class SimilarWordFinder.
/**
 * Walks over every document id of the Lucene index stored in {@code indexDir}, reads the
 * document's "word" field and hands it to {@code findSimilarWordsTo}.
 *
 * @param indexDir directory containing the Lucene index to read
 * @throws IOException if the index cannot be opened or read
 */
private void findSimilarWords(File indexDir) throws IOException {
  // The directory is opened in the same try-with-resources as the reader so that it is
  // closed as well (the reader does not close the directory it was opened on).
  try (FSDirectory dir = FSDirectory.open(indexDir.toPath());
       DirectoryReader reader = DirectoryReader.open(dir)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    // The reader is a point-in-time view, so maxDoc() is constant for the whole loop.
    int maxDoc = reader.maxDoc();
    for (int i = 0; i < maxDoc; i++) {
      Document doc = reader.document(i);
      String word = doc.get("word");
      findSimilarWordsTo(reader, searcher, word);
    }
  }
}
Use of org.apache.lucene.store.FSDirectory in the languagetool project by languagetool-org.
Method run of class Indexer.
/**
 * Feeds the given text file to an {@code Indexer} that writes into the given index directory.
 * If the text file does not exist or cannot be read, a message is printed and the JVM
 * exits with status 1.
 *
 * @param textFile path of the text file to index
 * @param indexDir path of the directory the index is written to
 * @param languageCode short code used to look up the {@code Language} for the indexer
 * @throws IOException if reading the text file or writing the index fails
 */
private static void run(String textFile, String indexDir, String languageCode) throws IOException {
  File input = new File(textFile);
  boolean usable = input.exists() && input.canRead();
  if (!usable) {
    System.out.println("Text file '" + input.getAbsolutePath() + "' does not exist or is not readable, please check the path");
    System.exit(1);
  }
  try (BufferedReader textReader = new BufferedReader(new FileReader(input))) {
    System.out.println("Indexing to directory '" + indexDir + "'...");
    File indexLocation = new File(indexDir);
    try (FSDirectory luceneDir = FSDirectory.open(indexLocation.toPath())) {
      // The language is resolved only after the directory has been opened, as before.
      Language lang = Languages.getLanguageForShortCode(languageCode);
      try (Indexer textIndexer = new Indexer(luceneDir, lang)) {
        textIndexer.indexText(textReader);
      }
    }
  }
  System.out.println("Index complete!");
}
Use of org.apache.lucene.store.FSDirectory in the languagetool project by languagetool-org.
Method main of class StartTokenCounter.
/**
 * Sums the "count" field of all "ngram" terms that start with
 * {@code LanguageModel.GOOGLE_SENTENCE_START} in the index at
 * {@code /data/google-ngram-index/en/2grams} and prints the total.
 * Terms ending in one of the tag suffixes matched below (presumably part-of-speech
 * tags -- confirm against the ngram data format) are skipped.
 *
 * @param args ignored
 * @throws IOException if the index cannot be opened or read
 * @throws RuntimeException if a term does not map to exactly one document
 */
public static void main(String[] args) throws IOException {
  // Compiled once up front; String.matches() would recompile the regex for every term.
  java.util.regex.Pattern taggedTerm =
      java.util.regex.Pattern.compile(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$");
  long totalCount = 0;
  File dir = new File("/data/google-ngram-index/en/2grams");
  try (FSDirectory directory = FSDirectory.open(dir.toPath());
       IndexReader reader = DirectoryReader.open(directory)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    Fields fields = MultiFields.getFields(reader);
    Terms ngrams = fields.terms("ngram");
    TermsEnum iterator = ngrams.iterator();
    BytesRef next;
    int i = 0;
    while ((next = iterator.next()) != null) {
      String term = next.utf8ToString();
      if (!term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
        continue;
      }
      if (taggedTerm.matcher(term).matches()) {
        continue;
      }
      TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
      if (topDocs.totalHits == 0) {
        throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
      } else if (topDocs.totalHits == 1) {
        int docId = topDocs.scoreDocs[0].doc;
        Document document = reader.document(docId);
        // Primitive long: no need to box the parsed value just to add it.
        long count = Long.parseLong(document.get("count"));
        totalCount += count;
        // Progress output every 10,000 counted terms.
        if (++i % 10_000 == 0) {
          System.out.println(i + " ... " + totalCount);
        }
      } else {
        throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
      }
    }
  }
  System.out.println("==> " + totalCount);
}
Use of org.apache.lucene.store.FSDirectory in the languagetool project by languagetool-org.
Method main of class LargestNGramFinder.
/**
 * Scans all terms of the "ngram" field in the index given as the only command line
 * argument and prints the term whose document has the largest "count" value.
 *
 * @param args exactly one element: the path of the ngram index directory
 * @throws IOException if the index cannot be opened or read
 * @throws RuntimeException if a term of the index has no matching document
 */
public static void main(String[] args) throws IOException {
  if (args.length != 1) {
    System.out.println("Usage: " + LargestNGramFinder.class.getSimpleName() + " <ngramIndexDir>");
    System.exit(1);
  }
  long max = 0;
  String maxTerm = "";
  // Directory and reader are closed via try-with-resources instead of being leaked.
  try (FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
       IndexReader reader = DirectoryReader.open(fsDir)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms("ngram");
    TermsEnum termsEnum = terms.iterator();
    int count = 0;
    BytesRef next;
    while ((next = termsEnum.next()) != null) {
      String term = next.utf8ToString();
      TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
      // Fail with a descriptive message rather than an ArrayIndexOutOfBoundsException
      // when a term has no matching document.
      if (topDocs.scoreDocs.length == 0) {
        throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
      }
      int docId = topDocs.scoreDocs[0].doc;
      Document document = reader.document(docId);
      long thisCount = Long.parseLong(document.get("count"));
      if (max < thisCount) {
        max = thisCount;
        maxTerm = term;
      }
      // Progress output every 10,000 terms.
      if (count % 10_000 == 0) {
        System.out.println(count + " -> " + topDocs.totalHits + " for " + term + " -> " + thisCount + ", max so far: " + max + " for '" + maxTerm + "'");
      }
      count++;
    }
  }
  System.out.println("Max: " + max + " for " + maxTerm);
}
Use of org.apache.lucene.store.FSDirectory in the languagetool project by languagetool-org.
Method index of class TextIndexCreator.
/**
 * Opens an index writer on {@code outputDir} using a {@code StandardAnalyzer} and passes
 * each input file to {@code indexFile}.
 *
 * @param outputDir directory the Lucene index is written to
 * @param inputFiles paths of the files to index
 * @throws IOException if the index cannot be opened or written
 */
private void index(File outputDir, String[] inputFiles) throws IOException {
  // The analyzer is Closeable too; resources are closed in reverse order, so the
  // writer is closed first, then the directory, then the analyzer.
  try (Analyzer analyzer = new StandardAnalyzer();
       FSDirectory directory = FSDirectory.open(outputDir.toPath());
       IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
    for (String inputFile : inputFiles) {
      indexFile(indexWriter, inputFile);
    }
  }
}
Aggregations