Search in sources :

Example 26 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project languagetool by languagetool-org.

the class SimilarWordFinder method findSimilarWords.

/**
 * Walks every document id of the Lucene index stored in {@code indexDir}, reads the
 * value of each document's "word" field and hands it to {@code findSimilarWordsTo}.
 *
 * @param indexDir directory that contains the Lucene index to read
 * @throws IOException if the index cannot be opened or a document cannot be read
 */
private void findSimilarWords(File indexDir) throws IOException {
    // The directory is a Closeable too: open it as a resource so it is released
    // together with the reader (resources are closed in reverse order).
    try (FSDirectory dir = FSDirectory.open(indexDir.toPath());
        DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        int maxDoc = reader.maxDoc();
        for (int i = 0; i < maxDoc; i++) {
            Document doc = reader.document(i);
            String word = doc.get("word");
            findSimilarWordsTo(reader, searcher, word);
        }
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) FSDirectory(org.apache.lucene.store.FSDirectory) Document(org.apache.lucene.document.Document)

Example 27 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project languagetool by languagetool-org.

the class Indexer method run.

/**
 * Indexes the content of a text file into a Lucene index directory.
 *
 * <p>If the text file is missing or unreadable, a message is printed and the JVM is
 * asked to exit with status 1.
 *
 * @param textFile path of the text file to index
 * @param indexDir path of the directory the index is written to
 * @param languageCode short code used to look up the {@code Language} for the indexer
 * @throws IOException if reading the text file or writing the index fails
 */
private static void run(String textFile, String indexDir, String languageCode) throws IOException {
    File input = new File(textFile);
    boolean readable = input.exists() && input.canRead();
    if (!readable) {
        System.out.println("Text file '" + input.getAbsolutePath() + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }
    try (BufferedReader textReader = new BufferedReader(new FileReader(input))) {
        System.out.println("Indexing to directory '" + indexDir + "'...");
        File indexLocation = new File(indexDir);
        try (FSDirectory luceneDir = FSDirectory.open(indexLocation.toPath())) {
            // The language is resolved only after the directory has been opened.
            Language lang = Languages.getLanguageForShortCode(languageCode);
            try (Indexer textIndexer = new Indexer(luceneDir, lang)) {
                textIndexer.indexText(textReader);
            }
        }
    }
    System.out.println("Index complete!");
}
Also used : Language(org.languagetool.Language) FSDirectory(org.apache.lucene.store.FSDirectory)

Example 28 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project languagetool by languagetool-org.

the class StartTokenCounter method main.

/**
 * Sums the "count" field over all "ngram" terms of the 2-gram index that start with
 * {@code LanguageModel.GOOGLE_SENTENCE_START}, skipping terms that end in a
 * part-of-speech suffix such as {@code _NOUN}. Progress is printed every 10,000
 * counted terms and the grand total at the end.
 *
 * @param args ignored; the index location is hard-coded
 * @throws IOException if the index cannot be opened or read
 * @throws RuntimeException if a term has zero hits or more than one hit
 */
public static void main(String[] args) throws IOException {
    // Compiled once up front; String.matches() would recompile the regex for every term.
    java.util.regex.Pattern posTagSuffix =
        java.util.regex.Pattern.compile(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$");
    long totalCount = 0;
    File dir = new File("/data/google-ngram-index/en/2grams");
    try (FSDirectory directory = FSDirectory.open(dir.toPath());
        IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Fields fields = MultiFields.getFields(reader);
        Terms ngrams = fields.terms("ngram");
        TermsEnum iterator = ngrams.iterator();
        BytesRef next;
        int i = 0;
        while ((next = iterator.next()) != null) {
            String term = next.utf8ToString();
            if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
                if (posTagSuffix.matcher(term).matches()) {
                    continue;
                }
                TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
                if (topDocs.totalHits == 0) {
                    throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
                } else if (topDocs.totalHits == 1) {
                    int docId = topDocs.scoreDocs[0].doc;
                    Document document = reader.document(docId);
                    // Primitive long: avoids a needless box/unbox on every counted term.
                    long count = Long.parseLong(document.get("count"));
                    totalCount += count;
                    if (++i % 10_000 == 0) {
                        System.out.println(i + " ... " + totalCount);
                    }
                } else {
                    throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
                }
            }
        }
    }
    System.out.println("==> " + totalCount);
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) FSDirectory(org.apache.lucene.store.FSDirectory) Document(org.apache.lucene.document.Document) TopDocs(org.apache.lucene.search.TopDocs) File(java.io.File) BytesRef(org.apache.lucene.util.BytesRef)

Example 29 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project languagetool by languagetool-org.

the class LargestNGramFinder method main.

/**
 * Scans all terms of the "ngram" field in the given index, reads the "count" field of
 * the first hit for each term and reports the term with the largest count. A progress
 * line is printed every 10,000 terms.
 *
 * @param args exactly one element: the path of the ngram index directory
 * @throws IOException if the index cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.out.println("Usage: " + LargestNGramFinder.class.getSimpleName() + " <ngramIndexDir>");
        System.exit(1);
    }
    long max = 0;
    String maxTerm = "";
    // Directory and reader both hold file handles; try-with-resources releases them
    // even when the scan fails part-way through.
    try (FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
        IndexReader reader = DirectoryReader.open(fsDir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms("ngram");
        TermsEnum termsEnum = terms.iterator();
        int count = 0;
        BytesRef next;
        while ((next = termsEnum.next()) != null) {
            String term = next.utf8ToString();
            TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
            int docId = topDocs.scoreDocs[0].doc;
            Document document = reader.document(docId);
            long thisCount = Long.parseLong(document.get("count"));
            if (max < thisCount) {
                max = thisCount;
                maxTerm = term;
            }
            if (count % 10_000 == 0) {
                System.out.println(count + " -> " + topDocs.totalHits + " for " + term + " -> " + thisCount + ", max so far: " + max + " for '" + maxTerm + "'");
            }
            count++;
        }
    }
    System.out.println("Max: " + max + " for " + maxTerm);
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) FSDirectory(org.apache.lucene.store.FSDirectory) Document(org.apache.lucene.document.Document) TopDocs(org.apache.lucene.search.TopDocs) File(java.io.File) BytesRef(org.apache.lucene.util.BytesRef)

Example 30 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project languagetool by languagetool-org.

the class TextIndexCreator method index.

/**
 * Creates (or opens) a Lucene index in {@code outputDir} and passes each input file,
 * in array order, to {@code indexFile} together with the shared index writer.
 *
 * @param outputDir directory the index is written to
 * @param inputFiles paths of the files to index
 * @throws IOException if the index cannot be opened or written
 */
private void index(File outputDir, String[] inputFiles) throws IOException {
    IndexWriterConfig writerConfig = new IndexWriterConfig(new StandardAnalyzer());
    try (FSDirectory indexDir = FSDirectory.open(outputDir.toPath());
        IndexWriter writer = new IndexWriter(indexDir, writerConfig)) {
        for (int pos = 0; pos < inputFiles.length; pos++) {
            indexFile(writer, inputFiles[pos]);
        }
    }
}
Also used : IndexWriter(org.apache.lucene.index.IndexWriter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) FSDirectory(org.apache.lucene.store.FSDirectory) Analyzer(org.apache.lucene.analysis.Analyzer) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

FSDirectory (org.apache.lucene.store.FSDirectory): 43, File (java.io.File): 18, Directory (org.apache.lucene.store.Directory): 12, IOException (java.io.IOException): 10, Path (java.nio.file.Path): 10, IndexSearcher (org.apache.lucene.search.IndexSearcher): 9, FileNotFoundException (java.io.FileNotFoundException): 5, FileSystem (java.nio.file.FileSystem): 5, Document (org.apache.lucene.document.Document): 5, IndexReader (org.apache.lucene.index.IndexReader): 5, MMapDirectory (org.apache.lucene.store.MMapDirectory): 5, NIOFSDirectory (org.apache.lucene.store.NIOFSDirectory): 5, FilterDirectory (org.apache.lucene.store.FilterDirectory): 4, SimpleFSDirectory (org.apache.lucene.store.SimpleFSDirectory): 4, PrintStream (java.io.PrintStream): 3, ArrayList (java.util.ArrayList): 3, DirectoryReader (org.apache.lucene.index.DirectoryReader): 3, Term (org.apache.lucene.index.Term): 3, WindowsFS (org.apache.lucene.mockfile.WindowsFS): 3, TermQuery (org.apache.lucene.search.TermQuery): 3