    /**
     * Get an IndexReader for the index database where a given file is located.
     * @param path the file to get the database for
     * @return a reader for the index database where the file should be located,
     * or null if it cannot be located.
     */
public static IndexReader getIndexReader(String path) {
    // NOTE(review): the closing braces of the nested blocks in this method are
    // not visible in this copy of the source; indentation is the only guide to
    // the intended nesting.
    IndexReader ret = null;
    RuntimeEnvironment env = RuntimeEnvironment.getInstance();
    // Start from the top-level index directory under the data root.
    File indexDir = new File(env.getDataRootFile(), INDEX_DIR);
    if (env.hasProjects()) {
        // With projects enabled, the index directory is narrowed to the path of
        // the project that owns the file; a file that maps to no project has
        // no index to open, so null is returned.
        Project p = Project.getProject(path);
        if (p == null) {
            return null;
        indexDir = new File(indexDir, p.getPath());
    try {
        // NOTE(review): the expression before ", NoLockFactory.INSTANCE)" is
        // missing in this copy; presumably a call that opens indexDir as an
        // FSDirectory without locking -- confirm against the full file.
        FSDirectory fdir =, NoLockFactory.INSTANCE);
        // Only attempt to read when the directory exists and holds an index.
        if (indexDir.exists() && DirectoryReader.indexExists(fdir)) {
            // NOTE(review): the right-hand side of this assignment is missing
            // in this copy; presumably a reader opened on fdir -- confirm.
            ret =;
    } catch (Exception ex) {
        // Failures are not propagated: the index path is logged at SEVERE, the
        // stack trace at FINE, and execution falls through to the return.
        LOGGER.log(Level.SEVERE, "Failed to open index: {0}", indexDir.getAbsolutePath());
        LOGGER.log(Level.FINE, "Stack Trace: ", ex);
    return ret;
    /**
     * Create the searcher to use with respect to the currently set parameters
     * and the given projects. Does not produce any {@link #redirect} link. It
     * also does nothing if {@link #redirect} or {@link #errorMsg} have a
     * non-{@code null} value.
     * <p>
     * Parameters which should be populated/set at this time: <ul>
     * <li>{@link #builder}</li> <li>{@link #dataRoot}</li>
     * <li>{@link #order} (falls back to relevance if unset)</li>
     * <li>{@link #parallel} (default: false)</li> </ul> Populates/sets: <ul>
     * <li>{@link #query}</li> <li>{@link #searcher}</li> <li>{@link #sort}</li>
     * <li>{@link #projects}</li> <li>{@link #errorMsg} if an error occurs</li>
     * </ul>
     * @param projects projects to query. If empty, a no-project setup
     * is assumed (i.e. DATA_ROOT/index will be used instead of possible
     * multiple DATA_ROOT/$project/index).
     * @return this instance
     */
public SearchHelper prepareExec(SortedSet<String> projects) {
    // NOTE(review): the closing braces of the nested blocks in this method are
    // not visible in this copy of the source; indentation is the only guide to
    // the intended nesting.
    // Nothing to prepare if a redirect or an error has already been recorded.
    if (redirect != null || errorMsg != null) {
        return this;
    // the Query created by the QueryBuilder
    try {
        indexDir = new File(dataRoot, IndexDatabase.INDEX_DIR);
        // NOTE(review): the right-hand side of this assignment is missing in
        // this copy; presumably the query produced by the builder -- confirm.
        query =;
        // A null project set is reported as an error; an empty set is valid
        // and is handled below.
        if (projects == null) {
            errorMsg = "No project selected!";
            return this;
        this.projects = projects;
        if (projects.isEmpty()) {
            // no project setup
            // NOTE(review): the expressions that open the directory and the
            // reader passed to IndexSearcher are missing in this copy.
            FSDirectory dir =;
            searcher = new IndexSearcher(;
            // Flag set only on this branch; presumably because the reader
            // opened here is owned by this helper -- confirm against the
            // code that reads closeOnDestroy.
            closeOnDestroy = true;
        } else {
            // We use MultiReader even for single project. This should
            // not matter given that MultiReader is just a cheap wrapper
            // around set of IndexReader objects.
            closeOnDestroy = false;
            MultiReader multireader = RuntimeEnvironment.getInstance().getMultiReader(projects, searcherList);
            // A null multireader is reported through errorMsg rather than by
            // throwing.
            if (multireader != null) {
                searcher = new IndexSearcher(multireader);
            } else {
                errorMsg = "Failed to initialize search. Check the index.";
        // Most probably they are not reused. SearcherLifetimeManager might help here.
        // NOTE(review): no "break" statements or "default" label are visible
        // between the cases below; as shown, every path would end with
        // Sort.RELEVANCE. Presumably they were lost from this copy -- confirm
        // against the full file.
        switch(order) {
            case LASTMODIFIED:
                // Third SortField argument (true) requests reverse order.
                sort = new Sort(new SortField(QueryBuilder.DATE, SortField.Type.STRING, true));
            case BY_PATH:
                sort = new Sort(new SortField(QueryBuilder.FULLPATH, SortField.Type.STRING));
                sort = Sort.RELEVANCE;
        checker = new DirectSpellChecker();
    } catch (ParseException e) {
        // Parse failures keep the parser's message, prefixed with PARSE_ERROR_MSG.
        errorMsg = PARSE_ERROR_MSG + e.getMessage();
    } catch (FileNotFoundException e) {
        // A missing index gets a fixed message; the exception text is not exposed.
        //          errorMsg = "Index database(s) not found: " + e.getMessage();
        errorMsg = "Index database(s) not found.";
    } catch (IOException e) {
        // Other I/O failures surface the raw exception message.
        errorMsg = e.getMessage();
    return this;
public static void main(String... args) throws Exception {
    // Command-line entry point: indexes sentences from the given data files
    // into a Lucene index directory and reports the elapsed time.
    // NOTE(review): the closing braces of the nested blocks in this method are
    // not visible in this copy of the source; indentation is the only guide to
    // the intended nesting.
    // Exactly five arguments are required; otherwise the usage text is printed.
    // NOTE(review): no exit or return is visible after the usage text in this
    // copy -- confirm against the full file.
    if (args.length != 5) {
        System.out.println("Usage: " + SentenceSourceIndexer.class.getSimpleName() + " <dataFile...> <indexDir> <languageCode> <maxSentences> <indexPosTags>");
        System.out.println("\t<dataFiles> comma-separated list of a Wikipedia XML dump (*.xml) and/or Tatoeba files (tatoeba-*)");
        System.out.println("\t<indexDir> directory where Lucene index will be written to, existing index content will be removed");
        System.out.println("\t<languageCode> short code like en for English, de for German etc");
        System.out.println("\t<maxSentences> maximum number of sentences to be indexed, use 0 for no limit");
        System.out.println("\t<indexPosTags> 1 to also index POS tags (i.e. analyze text by LT), 0 to index only the plain text");
    // Positional arguments: data files, index directory, language code,
    // sentence limit; the fifth (POS-tag switch) is read further down.
    List<String> dumpFilesNames = Arrays.asList(args[0].split(","));
    File indexDir = new File(args[1]);
    String languageCode = args[2];
    int maxSentences = Integer.parseInt(args[3]);
    Language language = Languages.getLanguageForShortCode(languageCode);
    // A limit of 0 is reported as "no limit" in the progress message.
    if (maxSentences == 0) {
        System.out.println("Going to index contents from " + dumpFilesNames);
    } else {
        System.out.println("Going to index up to " + maxSentences + " sentences from " + dumpFilesNames);
    System.out.println("Output index dir: " + indexDir);
    long start = System.currentTimeMillis();
    Analyzer analyzer;
    String indexPos = args[4];
    // "1" leaves the analyzer null, "0" selects a StandardAnalyzer with an
    // empty stop-word set, anything else is rejected.
    if (indexPos.equals("1")) {
        // this will use LanguageToolAnalyzer
        analyzer = null;
    } else if (indexPos.equals("0")) {
        analyzer = new StandardAnalyzer(new CharArraySet(Collections.emptyList(), false));
    } else {
        throw new IllegalArgumentException("Unknown value '" + indexPos + "' for indexPosTags parameter, use 0 or 1");
    // Both the directory and the indexer are try-with-resources resources.
    // NOTE(review): the expression that opens fsDirectory is missing in this
    // copy; presumably it opens indexDir -- confirm against the full file.
    try (FSDirectory fsDirectory =;
        SentenceSourceIndexer indexer = new SentenceSourceIndexer(fsDirectory, language, maxSentences, analyzer)) {
        try {
            // NOTE(review): the start of this statement is missing in this
            // copy; only its trailing ", language)" argument is visible.
  , language);
        } catch (DocumentLimitReachedException e) {
            // Reaching the sentence limit is reported and indexing stops.
            System.out.println("Sentence limit (" + e.getLimit() + ") reached, stopping indexing");
        } finally {
        // NOTE(review): the bodies of the finally block and of the null check
        // below are not visible in this copy; presumably the analyzer is
        // released here when one was created -- confirm.
        if (analyzer != null) {
    long end = System.currentTimeMillis();
    // Elapsed wall-clock time converted from milliseconds to minutes.
    float minutes = (end - start) / (float) (1000 * 60);
    System.out.printf("Indexing took %.2f minutes\n", minutes);
public void createIndexDirectory(String directoryPath) throws IOException {
    // Obtains an FSDirectory for the given path and constructs an IndexWriter
    // on it using this instance's analyzer.
    // NOTE(review): the third IndexWriter argument (true) is presumably the
    // "create" flag of the legacy Lucene constructor -- verify against the
    // Lucene version in use.
    // NOTE(review): no close of the writer and no closing brace are visible in
    // this copy of the source -- confirm against the full file.
    FSDirectory fsDir = FSDirectory.getDirectory(directoryPath);
    IndexWriter writer = new IndexWriter(fsDir, this.analyzer, true);
private void createIndex(List<String> words, File indexDir) throws IOException {
    // Builds one Document per word, each with a stored text field named
    // "word", inside a try-with-resources IndexWriter.
    // NOTE(review): the closing braces of the nested blocks in this method are
    // not visible in this copy of the source.
    // NOTE(review): the expression that opens the directory is missing in this
    // copy; presumably it opens indexDir -- confirm against the full file.
    FSDirectory dir =;
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer());
    System.out.println("Creating index...");
    int docs = 0;
    try (IndexWriter writer = new IndexWriter(dir, indexWriterConfig)) {
        for (String word : words) {
            Document doc = new Document();
            doc.add(new TextField("word", word, Field.Store.YES));
            // NOTE(review): no statement that adds doc to the writer or
            // increments docs is visible in this copy; as shown, the count
            // printed below would always be 0 -- confirm against the full file.
    System.out.println("Index created: " + docs + " docs");
