Search in sources :

Example 41 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class FileSourceOperatorTest method test2.

/**
 * Tests FileSourceOperator with a directory as its source.
 * <p>
 * All optional parameters are left at their defaults, so only the files
 * located directly under the directory are used.
 * <p>
 * Expected results: test1.txt and test2.txt are included.
 */
@Test
public void test2() throws Exception {
    String attrName = "content";
    Schema schema = new Schema(new Attribute(attrName, AttributeType.TEXT));
    FileSourcePredicate predicate = new FileSourcePredicate(tempFolderPath.toString(), attrName);
    FileSourceOperator fileSource = new FileSourceOperator(predicate);

    ArrayList<Tuple> exactResults = new ArrayList<>();
    fileSource.open();
    try {
        // Drain the operator: getNextTuple() returning null ends the loop.
        Tuple tuple;
        while ((tuple = fileSource.getNextTuple()) != null) {
            exactResults.add(tuple);
        }
    } finally {
        // Close the operator even when reading fails, so a failing read
        // does not leave it open for the rest of the test run.
        fileSource.close();
    }

    List<Tuple> expectedResults = Arrays.asList(
            new Tuple(schema, new TextField(tempFile1String)),
            new Tuple(schema, new TextField(tempFile2String)));
    Assert.assertTrue(TestUtils.equals(expectedResults, exactResults));
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) TextField(edu.uci.ics.texera.api.field.TextField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 42 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class MedlineIndexWriter method recordToTuple.

/**
 * Converts one record, supplied as JSON text, into a Tuple built on SCHEMA_MEDLINE.
 * <p>
 * For every entry of ATTRIBUTES_MEDLINE, in iteration order, the JSON member with the
 * attribute's name is looked up and handed to StorageUtils.getField together with the
 * attribute's type.
 *
 * @param record the JSON text of a single record
 * @return a tuple holding one field per entry of ATTRIBUTES_MEDLINE
 * @throws IOException declared by the signature; presumably raised when the record
 *         cannot be read as JSON (confirm against ObjectMapper.readValue)
 * @throws ParseException declared by the signature; presumably raised by the field
 *         conversion (confirm against StorageUtils.getField)
 */
public static Tuple recordToTuple(String record) throws IOException, ParseException {
    ObjectMapper mapper = new ObjectMapper();
    JsonNode root = mapper.readValue(record, JsonNode.class);

    ArrayList<IField> fields = new ArrayList<>();
    for (Attribute attribute : ATTRIBUTES_MEDLINE) {
        // NOTE(review): a member missing from the record makes get(...) return null and the
        // toString() call fail; toString() is also expected to yield the member's JSON text
        // (string values keep their quotes) - confirm both are intended.
        fields.add(StorageUtils.getField(
                attribute.getType(),
                root.get(attribute.getName()).toString()));
    }

    return new Tuple(SCHEMA_MEDLINE, fields.toArray(new IField[fields.size()]));
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) ArrayList(java.util.ArrayList) JsonNode(com.fasterxml.jackson.databind.JsonNode) IField(edu.uci.ics.texera.api.field.IField) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 43 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class DataReader method documentToFields.

/**
 * Builds the list of fields for a Lucene document.
 * <p>
 * For each attribute of inputSchema, in iteration order, the value stored in the
 * document under the attribute's name is converted by StorageUtils.getField
 * according to the attribute's type.
 *
 * @param luceneDocument the document whose stored values are read
 * @return one field per attribute of inputSchema, in schema order
 * @throws ParseException declared by the signature; presumably raised by the
 *         field conversion (confirm against StorageUtils.getField)
 */
private ArrayList<IField> documentToFields(Document luceneDocument) throws ParseException {
    ArrayList<IField> result = new ArrayList<>();
    for (Attribute attribute : inputSchema.getAttributes()) {
        String storedValue = luceneDocument.get(attribute.getName());
        result.add(StorageUtils.getField(attribute.getType(), storedValue));
    }
    return result;
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) ArrayList(java.util.ArrayList) IField(edu.uci.ics.texera.api.field.IField)

Example 44 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class DataReader method buildPayloadFromTermVector.

/**
 * Builds the payload span list of one document from its term vectors.
 * <p>
 * For every TEXT attribute of inputSchema that has a term vector for the given
 * document, one Span is created per occurrence of each term. Each span carries the
 * attribute name, the character start/end offsets, the term as stored in the term
 * vector, the corresponding substring of the field value, and the token position.
 *
 * @param fields the document's field values; the value of an attribute is read at
 *        position inputSchema.getIndex(attributeName)
 * @param docID the document id passed to luceneIndexReader.getTermVector
 * @return the spans collected over all TEXT attributes (empty if none were found)
 * @throws IOException declared by the signature; presumably raised by the Lucene
 *         term-vector and postings calls
 */
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> payloadSpanList = new ArrayList<>();
    for (Attribute attr : inputSchema.getAttributes()) {
        String attributeName = attr.getName();
        AttributeType attributeType = attr.getType();
        // Only TEXT attributes contribute spans to the payload; every other
        // attribute type is skipped.
        if (attributeType != AttributeType.TEXT) {
            continue;
        }
        String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
        Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
        // No term vector is available for this attribute in this document: nothing to add.
        if (termVector == null) {
            continue;
        }
        TermsEnum termsEnum = termVector.iterator();
        // Passed back into postings(...) on every term (presumably so Lucene can reuse it).
        PostingsEnum termPostings = null;
        // go through document terms
        while ((termsEnum.next()) != null) {
            termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
            // Skip terms whose postings contain no document.
            if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }
            // for each term, go through its postings
            for (int i = 0; i < termPostings.freq(); i++) {
                // nextPosition needs to be called first
                int tokenPosition = termPostings.nextPosition();
                int charStart = termPostings.startOffset();
                int charEnd = termPostings.endOffset();
                // The term as stored in the term vector (the analyzed form).
                String analyzedTermStr = termsEnum.term().utf8ToString();
                // The original text is recovered from the field value via the offsets.
                // NOTE(review): assumes the offsets are valid indices into fieldValue - confirm.
                String originalTermStr = fieldValue.substring(charStart, charEnd);
                Span span = new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition);
                payloadSpanList.add(span);
            }
        }
    }
    return payloadSpanList;
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) ArrayList(java.util.ArrayList) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) Span(edu.uci.ics.texera.api.span.Span) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 45 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class RelationManagerTest method test15.

/**
 * Verifies that two tables cannot share the same index directory: creating a second
 * table on a directory already used by another table must raise a StorageException.
 */
@Test
public void test15() throws Exception {
    String tableName1 = "relation_manager_test_table_15_1";
    String tableName2 = "relation_manager_test_table_15_2";
    String indexDirectory = "./index/test_table/relation_manager_test_table_15";
    Schema schema = new Schema(new Attribute("content", AttributeType.TEXT));
    String luceneAnalyzerString = "standard";

    // Start from a clean state in case an earlier run left the tables behind.
    relationManager.deleteTable(tableName1);
    relationManager.deleteTable(tableName2);

    try {
        relationManager.createTable(tableName1, Paths.get(indexDirectory), schema, luceneAnalyzerString);
        // Creating another table with the same directory should fail.
        try {
            relationManager.createTable(tableName2, Paths.get(indexDirectory), schema, luceneAnalyzerString);
            Assert.fail("Storage exception should be thrown because of duplicate index directories");
        } catch (StorageException expected) {
            // Expected outcome: the duplicate index directory was rejected.
        }
    } finally {
        // Always clean up, including when the assertion above fails or the second
        // table was wrongly created, so later tests are not affected.
        relationManager.deleteTable(tableName1);
        relationManager.deleteTable(tableName2);
    }
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) Schema(edu.uci.ics.texera.api.schema.Schema) StorageException(edu.uci.ics.texera.api.exception.StorageException) Test(org.junit.Test)

Aggregations

Attribute (edu.uci.ics.texera.api.schema.Attribute)98 Test (org.junit.Test)81 Tuple (edu.uci.ics.texera.api.tuple.Tuple)78 ArrayList (java.util.ArrayList)76 Schema (edu.uci.ics.texera.api.schema.Schema)75 IField (edu.uci.ics.texera.api.field.IField)60 StringField (edu.uci.ics.texera.api.field.StringField)56 TextField (edu.uci.ics.texera.api.field.TextField)56 IntegerField (edu.uci.ics.texera.api.field.IntegerField)54 DoubleField (edu.uci.ics.texera.api.field.DoubleField)53 Span (edu.uci.ics.texera.api.span.Span)51 DateField (edu.uci.ics.texera.api.field.DateField)50 SimpleDateFormat (java.text.SimpleDateFormat)47 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)28 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)9 IOperator (edu.uci.ics.texera.api.dataflow.IOperator)8 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)6 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)6 List (java.util.List)6 Collectors (java.util.stream.Collectors)5