Search in sources :

Example 41 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project titan by thinkaurelius.

The snippet below is from the class LuceneExample, method example1.

@Test
public void example1() throws Exception {
    // --- Index phase: write three sample documents into a Lucene index at {@code path} ---
    Directory dir = FSDirectory.open(path);
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_4, analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    // try-with-resources guarantees the writer (and its index lock) is released even if indexing throws
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
        indexDocs(writer, "doc1", ImmutableMap.of("name", "The laborious work of John Doe as we know it", "city", "Blumenkamp", "location", Geoshape.point(51.687882, 6.612053), "time", 1000342034));
        indexDocs(writer, "doc2", ImmutableMap.of("name", "Life as we know it or not", "city", "Essen", "location", Geoshape.point(51.787882, 6.712053), "time", 1000342034 - 500));
        indexDocs(writer, "doc3", ImmutableMap.of("name", "Berlin - poor but sexy and a display of the extraordinary", "city", "Berlin", "location", Geoshape.circle(52.509535, 13.425293, 50), "time", 1000342034 + 2000));
    }
    // --- Search phase: reopen the directory for reading; close the reader when done ---
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(path))) {
        IndexSearcher searcher = new IndexSearcher(reader);
        BooleanFilter filter = new BooleanFilter();
        //filter.add(new TermsFilter(new Term("name_txt","know")), BooleanClause.Occur.MUST);
        // args is only consumed by the commented-out spatial filter below; kept for the example
        SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects, Geoshape.circle(51.666167, 6.58905, 450).convert2Spatial4j());
        //filter.add(getSpatialStrategy("location").makeFilter(args), BooleanClause.Occur.MUST);
        // match only documents whose "time" equals 1000342034 (inclusive range of width zero)
        filter.add(NumericRangeFilter.newLongRange("time", (long) 1000342034, (long) 1000342034, true, true), BooleanClause.Occur.MUST);
        //        filter.add(NumericRangeFilter.newLongRange("time",(long)1000342034-100,Long.MAX_VALUE,true,true), BooleanClause.Occur.MUST);
        //        filter.add(NumericRangeFilter.newLongRange("time",Long.MIN_VALUE,(long)1000342034+300,true,true), BooleanClause.Occur.MUST);
        // ...and whose "city_str" field starts with "B"
        filter.add(new PrefixFilter(new Term("city_str", "B")), BooleanClause.Occur.MUST);
        TopDocs docs = searcher.search(new MatchAllDocsQuery(), filter, MAX_RESULT);
        if (docs.totalHits >= MAX_RESULT)
            throw new RuntimeException("Max results exceeded: " + MAX_RESULT);
        Set<String> result = getResults(searcher, docs);
        System.out.println(result);
    }
}
Also used : BooleanFilter(org.apache.lucene.queries.BooleanFilter) SpatialArgs(org.apache.lucene.spatial.query.SpatialArgs) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Test(org.junit.Test)

Example 42 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project stargate-core by tuplejump.

The snippet below is from the class CassandraUtils, method getOptions.

/**
 * Builds the index {@link Options} for the column family {@code baseCfs} from the
 * user-supplied field {@code mapping}.
 *
 * <p>Collects, for every indexed column: its Cassandra validator, its Lucene field
 * type (plain, doc-values, and collection variants), numeric parsing config, and
 * whether it participates in the partition or clustering key.
 *
 * @param mapping the stargate field mapping (field names -> per-field properties)
 * @param baseCfs the Cassandra column family being indexed
 * @param colName the name of the column the index itself is declared on
 * @return the assembled Options used to drive indexing and querying
 * @throws IllegalArgumentException if a mapped field has no column definition
 */
public static Options getOptions(Properties mapping, ColumnFamilyStore baseCfs, String colName) {
    Map<String, NumericConfig> numericFieldOptions = new HashMap<>();
    Map<String, FieldType> fieldDocValueTypes = new TreeMap<>();
    Map<String, FieldType> collectionFieldDocValueTypes = new TreeMap<>();
    Map<String, FieldType> fieldTypes = new TreeMap<>();
    Map<String, FieldType[]> collectionFieldTypes = new TreeMap<>();
    Map<String, ColumnDefinition> validators = new TreeMap<>();
    Map<String, ColumnDefinition> clusteringKeysIndexed = new LinkedHashMap<>();
    Map<String, ColumnDefinition> partitionKeysIndexed = new LinkedHashMap<>();
    Set<String> indexedColumnNames;
    //getForRow all the fields options.
    indexedColumnNames = new TreeSet<>();
    indexedColumnNames.addAll(mapping.getFields().keySet());
    // tracks field names (lower-cased) whose types were already registered, to avoid duplicates
    Set<String> added = new HashSet<>(indexedColumnNames.size());
    List<ColumnDefinition> partitionKeys = baseCfs.metadata.partitionKeyColumns();
    List<ColumnDefinition> clusteringKeys = baseCfs.metadata.clusteringColumns();
    for (ColumnDefinition colDef : partitionKeys) {
        String columnName = colDef.name.toString();
        if (Options.logger.isDebugEnabled()) {
            // log the partition key's own name (previously logged colName, the index column parameter)
            Options.logger.debug("Partition key name is {} and index is {}", columnName, colDef.position());
        }
        validators.put(columnName, colDef);
        if (indexedColumnNames.contains(columnName)) {
            // key by the partition-key column's own name, consistent with clusteringKeysIndexed below
            // (previously keyed by colName, which collapsed all indexed partition keys onto one entry)
            partitionKeysIndexed.put(columnName, colDef);
            addPropertiesAndFieldType(mapping, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes, added, colDef, columnName);
        }
    }
    for (ColumnDefinition colDef : clusteringKeys) {
        String columnName = colDef.name.toString();
        if (Options.logger.isDebugEnabled()) {
            // log the clustering key's own name (previously logged colName, the index column parameter)
            Options.logger.debug("Clustering key name is {} and index is {}", columnName, colDef.position() + 1);
        }
        validators.put(columnName, colDef);
        if (indexedColumnNames.contains(columnName)) {
            clusteringKeysIndexed.put(columnName, colDef);
            addPropertiesAndFieldType(mapping, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes, added, colDef, columnName);
        }
    }
    // register the remaining mapped fields that are not partition/clustering keys
    for (String columnName : indexedColumnNames) {
        if (added.add(columnName.toLowerCase())) {
            Properties options = mapping.getFields().get(columnName);
            ColumnDefinition colDef = getColumnDefinition(baseCfs, columnName);
            if (colDef != null) {
                validators.put(columnName, colDef);
                addFieldType(columnName, colDef.type, options, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes);
            } else {
                throw new IllegalArgumentException(String.format("Column Definition for %s not found", columnName));
            }
            // object-typed fields contribute their nested fields to the mapping
            if (options.getType() == Type.object) {
                mapping.getFields().putAll(options.getFields());
            }
        }
    }
    // every regular column gets a validator entry even when not indexed
    Set<ColumnDefinition> otherColumns = baseCfs.metadata.regularColumns();
    for (ColumnDefinition colDef : otherColumns) {
        String columnName = UTF8Type.instance.getString(colDef.name.bytes);
        validators.put(columnName, colDef);
    }
    numericFieldOptions.putAll(mapping.getDynamicNumericConfig());
    Analyzer defaultAnalyzer = mapping.getLuceneAnalyzer();
    // per-field analyzers override the default for the fields they name
    Analyzer analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, mapping.perFieldAnalyzers());
    Map<String, Type> types = new TreeMap<>();
    Set<String> nestedFields = new TreeSet<>();
    for (Map.Entry<String, ColumnDefinition> entry : validators.entrySet()) {
        CQL3Type cql3Type = entry.getValue().type.asCQL3Type();
        AbstractType inner = getValueValidator(cql3Type.getType());
        if (cql3Type.isCollection()) {
            // collections are typed by their element type and treated as nested fields
            types.put(entry.getKey(), fromAbstractType(inner.asCQL3Type()));
            nestedFields.add(entry.getKey());
        } else {
            types.put(entry.getKey(), fromAbstractType(cql3Type));
        }
    }
    return new Options(mapping, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes, types, nestedFields, clusteringKeysIndexed, partitionKeysIndexed, indexedColumnNames, analyzer, colName);
}
Also used : CQL3Type(org.apache.cassandra.cql3.CQL3Type) Options(com.tuplejump.stargate.lucene.Options) Properties(com.tuplejump.stargate.lucene.Properties) Analyzer(org.apache.lucene.analysis.Analyzer) FieldType(org.apache.lucene.document.FieldType) ColumnDefinition(org.apache.cassandra.config.ColumnDefinition) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) Type(com.tuplejump.stargate.lucene.Type) FieldType(org.apache.lucene.document.FieldType) CQL3Type(org.apache.cassandra.cql3.CQL3Type) NumericConfig(org.apache.lucene.queryparser.flexible.standard.config.NumericConfig)

Example 43 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project stargate-core by tuplejump.

The snippet below is from the class PhraseCondition, method query.

/**
     * {@inheritDoc}
     */
@Override
public Query query(Options schema) {
    // Validate the condition's own state before touching the schema.
    if (field == null || field.trim().isEmpty()) {
        throw new IllegalArgumentException("Field name required");
    }
    if (values == null) {
        throw new IllegalArgumentException("Field values required");
    }
    if (slop == null) {
        throw new IllegalArgumentException("Slop required");
    }
    if (slop < 0) {
        // slop == 0 is legal (exact phrase); only negative values are rejected
        throw new IllegalArgumentException("Slop must be non-negative");
    }
    Properties properties = schema.getProperties(field);
    // fields without an explicit mapping default to text
    Type fieldType = properties != null ? properties.getType() : Type.text;
    if (fieldType.isCharSeq()) {
        Analyzer analyzer = schema.analyzer;
        PhraseQuery.Builder query = new PhraseQuery.Builder();
        query.setSlop(slop);
        int count = 0;
        for (String value : values) {
            if (value != null) {
                String analyzedValue = analyze(field, value, analyzer);
                if (analyzedValue != null) {
                    Term term = new Term(field, analyzedValue);
                    // positions follow the original value order, including skipped nulls,
                    // so the phrase keeps the intended gaps
                    query.add(term, count);
                }
            }
            count++;
        }
        return query.build();
    }
    // String.format with no arguments was needless; use the literal directly
    throw new UnsupportedOperationException("Phrase queries cannot be supported until mapping is defined");
}
Also used : Type(com.tuplejump.stargate.lucene.Type) PhraseQuery(org.apache.lucene.search.PhraseQuery) Term(org.apache.lucene.index.Term) Properties(com.tuplejump.stargate.lucene.Properties) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 44 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project textdb by TextDB.

The snippet below is from the class RelationManager, method createTable.

/**
     * Creates a new table. 
     *   Table name must be unique (case insensitive).
     *   LuceneAnalyzer must be a valid analyzer string.
     * 
     * The "_id" attribute will be added to the table schema.
     * System automatically generates a unique ID for each tuple inserted to a table,
     *   the generated ID will be in "_id" field.
     * 
     * @param tableName, the name of the table, must be unique, case is not sensitive
     * @param indexDirectory, the directory to store the index and data, must not duplicate with other tables' directories
     * @param schema, the schema of the table
     * @param luceneAnalyzerString, the string representing the lucene analyzer used
     * @throws StorageException
     */
public void createTable(String tableName, String indexDirectory, Schema schema, String luceneAnalyzerString) throws StorageException {
    // convert the table name to lower case (table names are case-insensitive)
    tableName = tableName.toLowerCase();
    // table should not exist
    if (checkTableExistence(tableName)) {
        throw new StorageException(String.format("Table %s already exists.", tableName));
    }
    // create the directory if needed and convert the index directory to its absolute path
    try {
        Path indexPath = Paths.get(indexDirectory);
        if (Files.notExists(indexPath)) {
            Files.createDirectories(indexPath);
        }
        indexDirectory = indexPath.toRealPath().toString();
    } catch (IOException e) {
        throw new StorageException(e);
    }
    // check if the indexDirectory overlaps with another table's index directory
    Query indexDirectoryQuery = new TermQuery(new Term(CatalogConstants.TABLE_DIRECTORY, indexDirectory));
    DataReader tableCatalogDataReader = new DataReader(CatalogConstants.TABLE_CATALOG_DATASTORE, indexDirectoryQuery);
    tableCatalogDataReader.setPayloadAdded(false);
    tableCatalogDataReader.open();
    Tuple nextTuple;
    try {
        nextTuple = tableCatalogDataReader.getNextTuple();
    } finally {
        // close the reader even if reading the catalog fails
        tableCatalogDataReader.close();
    }
    // if the index directory is already taken by another table, throws an exception
    if (nextTuple != null) {
        String overlapTableName = nextTuple.getField(CatalogConstants.TABLE_NAME).getValue().toString();
        throw new StorageException(String.format("Table %s already takes the index directory %s. Please choose another directory.", overlapTableName, indexDirectory));
    }
    // check if the lucene analyzer string is valid
    Analyzer luceneAnalyzer = null;
    try {
        luceneAnalyzer = LuceneAnalyzerConstants.getLuceneAnalyzer(luceneAnalyzerString);
    } catch (DataFlowException e) {
        throw new StorageException("Lucene Analyzer String is not valid.");
    }
    // create the directory and clear all data in the index directory;
    // the generated "_id" attribute is added to the stored schema
    Schema tableSchema = Utils.getSchemaWithID(schema);
    DataStore tableDataStore = new DataStore(indexDirectory, tableSchema);
    DataWriter dataWriter = new DataWriter(tableDataStore, luceneAnalyzer);
    dataWriter.open();
    try {
        dataWriter.clearData();
    } finally {
        // close the writer even if clearing the index fails, releasing the index lock
        dataWriter.close();
    }
    // write table info to catalog
    writeTableInfoToCatalog(tableName, indexDirectory, schema, luceneAnalyzerString);
}
Also used : Path(java.nio.file.Path) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) Schema(edu.uci.ics.textdb.api.schema.Schema) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) Analyzer(org.apache.lucene.analysis.Analyzer) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) StorageException(edu.uci.ics.textdb.api.exception.StorageException) Tuple(edu.uci.ics.textdb.api.tuple.Tuple)

Example 45 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project textdb by TextDB.

The snippet below is from the class RelationManagerTest, method test1.

/*
 * Verifies that the metadata of the "table catalog" table itself (its
 * directory, analyzer, and schema) is registered correctly.
 */
@Test
public void test1() throws Exception {
    String catalogDirectory = relationManager.getTableDirectory(CatalogConstants.TABLE_CATALOG);
    Analyzer catalogAnalyzer = relationManager.getTableAnalyzer(CatalogConstants.TABLE_CATALOG);
    Schema catalogSchema = relationManager.getTableSchema(CatalogConstants.TABLE_CATALOG);
    File expectedDirectory = new File(CatalogConstants.TABLE_CATALOG_DIRECTORY);
    Assert.assertEquals(catalogDirectory, expectedDirectory.getCanonicalPath());
    Assert.assertTrue(catalogAnalyzer instanceof StandardAnalyzer);
    Assert.assertEquals(catalogSchema, Utils.getSchemaWithID(CatalogConstants.TABLE_CATALOG_SCHEMA));
}
Also used : Schema(edu.uci.ics.textdb.api.schema.Schema) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) File(java.io.File) Test(org.junit.Test)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer)1020 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)396 Tokenizer (org.apache.lucene.analysis.Tokenizer)265 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)228 Document (org.apache.lucene.document.Document)207 Directory (org.apache.lucene.store.Directory)192 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)176 BytesRef (org.apache.lucene.util.BytesRef)122 Test (org.junit.Test)119 TokenStream (org.apache.lucene.analysis.TokenStream)107 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)92 Term (org.apache.lucene.index.Term)92 IndexReader (org.apache.lucene.index.IndexReader)67 InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator)65 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)64 Input (org.apache.lucene.search.suggest.Input)63 CharArraySet (org.apache.lucene.analysis.CharArraySet)58 ArrayList (java.util.ArrayList)57 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)57 TextField (org.apache.lucene.document.TextField)55