Search in sources :

Example 46 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class MedlineIndexWriter method recordToTuple.

/**
 * Builds a {@link Tuple} from a single record given as JSON text.
 *
 * <p>The record is parsed into a JSON tree. For each attribute in {@code ATTRIBUTES_MEDLINE}
 * (in iteration order) the child node with the attribute's name is looked up, rendered with
 * {@code toString()}, and handed to {@code StorageUtils.getField} together with the attribute
 * type. The resulting fields are paired with {@code SCHEMA_MEDLINE}.
 *
 * <p>NOTE(review): a record that lacks one of the attribute names makes the node lookup
 * return {@code null}, and the following {@code toString()} call then throws a
 * {@link NullPointerException} - confirm that inputs always carry every attribute.
 *
 * @param record JSON text of one record
 * @return a tuple holding one field per attribute of {@code ATTRIBUTES_MEDLINE}
 * @throws IOException if the JSON text cannot be read
 * @throws ParseException declared by the signature; presumably raised while converting a
 *         field value - confirm against {@code StorageUtils.getField}
 */
public static Tuple recordToTuple(String record) throws IOException, ParseException {
    JsonNode root = new ObjectMapper().readValue(record, JsonNode.class);
    ArrayList<IField> fields = new ArrayList<>();
    for (Attribute attribute : ATTRIBUTES_MEDLINE) {
        JsonNode valueNode = root.get(attribute.getName());
        fields.add(StorageUtils.getField(attribute.getType(), valueNode.toString()));
    }
    return new Tuple(SCHEMA_MEDLINE, fields.toArray(new IField[fields.size()]));
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) ArrayList(java.util.ArrayList) JsonNode(com.fasterxml.jackson.databind.JsonNode) IField(edu.uci.ics.texera.api.field.IField) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 47 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class NltkSentimentOperator method convertArrowVectorsToResults.

/**
 * Converts every row of an Arrow {@link VectorSchemaRoot} into a Texera {@link Tuple} and
 * appends it to {@code resultQueue}.
 *
 * <p>For each row, one {@link IField} is produced per field vector, chosen by the vector's
 * Arrow type id. When reading a value raises an {@link IllegalStateException} whose message
 * contains {@code "Value at index is null"}, a field wrapping {@code null} is produced for
 * the Int, FloatingPoint, Date, Struct and List types; for any other type the field itself
 * stays {@code null}. Any other {@link IllegalStateException} is rethrown wrapped in a
 * {@link DataflowException}.
 *
 * @param schemaRoot the Arrow data to convert; its schema is translated with
 *        {@code convertToTexeraSchema}
 * @throws DataflowException if a vector has an unsupported type, or if reading a value
 *         fails for a reason other than the value being null
 */
private void convertArrowVectorsToResults(VectorSchemaRoot schemaRoot) {
    List<FieldVector> fieldVectors = schemaRoot.getFieldVectors();
    Schema texeraSchema = convertToTexeraSchema(schemaRoot.getSchema());
    int rowCount = schemaRoot.getRowCount();
    for (int i = 0; i < rowCount; i++) {
        List<IField> texeraFields = new ArrayList<>();
        for (FieldVector vector : fieldVectors) {
            IField texeraField = null;
            try {
                switch (vector.getField().getFieldType().getType().getTypeID()) {
                    case Int:
                        // The producer (Python) decides the integer width, so the vector is
                        // either an IntVector or a BigIntVector; test the type rather than
                        // relying on a ClassCastException for control flow.
                        if (vector instanceof IntVector) {
                            texeraField = new IntegerField(((IntVector) vector).get(i));
                        } else {
                            // NOTE(review): the (int) cast narrows silently; values outside
                            // the int range would wrap - confirm this cannot happen upstream.
                            texeraField = new IntegerField((int) ((BigIntVector) vector).get(i));
                        }
                        break;
                    case FloatingPoint:
                        texeraField = new DoubleField((((Float8Vector) vector).get(i)));
                        break;
                    case Utf8:
                        texeraField = new TextField(new String(((VarCharVector) vector).get(i), StandardCharsets.UTF_8));
                        break;
                    case Date:
                        // NOTE(review): the Struct branch below treats a DateDayVector value as
                        // an epoch-day count, while here the raw value goes straight into the
                        // Date constructor - confirm the unit matches what the writer stored.
                        texeraField = new DateField(new Date(((DateDayVector) vector).get(i)));
                        break;
                    case Struct:
                        // For now, struct is only for DateTime: child 0 is the day, child 1 the
                        // second of the day.
                        DateDayVector subVectorDay = (DateDayVector) ((StructVector) vector).getChildByOrdinal(0);
                        TimeSecVector subVectorTime = (TimeSecVector) ((StructVector) vector).getChildByOrdinal(1);
                        texeraField = new DateTimeField(LocalDateTime.of(LocalDate.ofEpochDay(subVectorDay.get(i)), LocalTime.ofSecondOfDay(subVectorTime.get(i))));
                        break;
                    case List:
                        texeraField = getSpanFromListVector((ListVector) vector, i);
                        break;
                    default:
                        throw (new DataflowException("Unsupported data type " + vector.getField().toString() + " when converting back to Texera table."));
                }
            } catch (IllegalStateException e) {
                // getMessage() may legitimately be null; treat that as "not a null-value
                // signal" instead of failing with a NullPointerException that hides e.
                String message = e.getMessage();
                if (message == null || !message.contains("Value at index is null")) {
                    throw new DataflowException(e);
                }
                // The value at this index is null: emit a typed field wrapping null.
                switch (vector.getField().getFieldType().getType().getTypeID()) {
                    case Int:
                        texeraField = new IntegerField(null);
                        break;
                    case FloatingPoint:
                        texeraField = new DoubleField(null);
                        break;
                    case Date:
                        texeraField = new DateField((String) null);
                        break;
                    case Struct:
                        texeraField = new DateTimeField((String) null);
                        break;
                    case List:
                        texeraField = new ListField<Span>(null);
                        break;
                    default:
                        // Other types keep texeraField == null.
                        break;
                }
            }
            texeraFields.add(texeraField);
        }
        resultQueue.add(new Tuple(texeraSchema, texeraFields));
    }
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) LocalDate(java.time.LocalDate) ListVector(org.apache.arrow.vector.complex.ListVector) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 48 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class ProjectionOperatorTest method testProjection1.

/**
 * Projects the people table onto the description attribute and checks that the returned
 * tuples equal the six expected single-field tuples.
 */
@Test
public void testProjection1() throws Exception {
    List<String> projectionFields = Arrays.asList(TestConstants.DESCRIPTION);
    Schema projectionSchema = new Schema(TestConstants.DESCRIPTION_ATTR);

    // Expected description values, in the order the expected tuples are listed.
    String[] descriptions = {
        "Tall Angry",
        "Short Brown",
        "White Angry",
        "Lin Clooney is Short and lin clooney is Angry",
        "Tall Fair",
        "Short angry"
    };
    Tuple[] expectedTuples = new Tuple[descriptions.length];
    for (int idx = 0; idx < descriptions.length; idx++) {
        IField[] singleField = { new TextField(descriptions[idx]) };
        expectedTuples[idx] = new Tuple(projectionSchema, singleField);
    }
    List<Tuple> expectedResults = Arrays.asList(expectedTuples);

    ScanSourcePredicate scanPredicate = new ScanSourcePredicate(PEOPLE_TABLE);
    ScanBasedSourceOperator scanSource = new ScanBasedSourceOperator(scanPredicate);
    List<Tuple> returnedResults = getProjectionResults(scanSource, projectionFields);

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) TextField(edu.uci.ics.texera.api.field.TextField) IField(edu.uci.ics.texera.api.field.IField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) ScanBasedSourceOperator(edu.uci.ics.texera.dataflow.source.scan.ScanBasedSourceOperator) ScanSourcePredicate(edu.uci.ics.texera.dataflow.source.scan.ScanSourcePredicate) Test(org.junit.Test)

Example 49 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class RegexMatcherTest method testRegexText2.

/**
 * Runs the regex {@code follow(-| )?up} over the content attribute of the text table and
 * checks that the results equal sample tuples 4, 5 and 6, each extended with a list field
 * holding the two expected spans.
 */
@Test
public void testRegexText2() throws Exception {
    String query = "follow(-| )?up";
    List<Tuple> exactResults = RegexMatcherTestHelper.getQueryResults(TEXT_TABLE, query, Arrays.asList(RegexTestConstantsText.CONTENT));

    List<Tuple> sampleTuples = RegexTestConstantsText.getSampleTextTuples();
    Schema spanSchema = new Schema.Builder().add(RegexTestConstantsText.SCHEMA_TEXT).add(RESULTS, AttributeType.LIST).build();

    // One row per expected tuple: index into the sample data, then the matched texts and
    // the {start, end} offsets of each expected span.
    int[] sampleIndexes = { 4, 5, 6 };
    String[][] matchedTexts = {
        { "followup", "followup" },     // expected to match "followup"
        { "follow up", "follow up" },   // expected to match "follow up"
        { "follow-up", "followup" }     // expected to match "follow-up" & "followup"
    };
    int[][][] spanOffsets = {
        { { 28, 36 }, { 54, 62 } },
        { { 18, 27 }, { 51, 60 } },
        { { 24, 33 }, { 38, 46 } }
    };

    List<Tuple> expectedResults = new ArrayList<Tuple>();
    for (int row = 0; row < sampleIndexes.length; row++) {
        List<Span> rowSpans = new ArrayList<Span>();
        for (int s = 0; s < matchedTexts[row].length; s++) {
            int start = spanOffsets[row][s][0];
            int end = spanOffsets[row][s][1];
            rowSpans.add(new Span(RegexTestConstantsText.CONTENT, start, end, query, matchedTexts[row][s]));
        }
        List<IField> rowFields = new ArrayList<IField>(sampleTuples.get(sampleIndexes[row]).getFields());
        rowFields.add(new ListField<Span>(rowSpans));
        expectedResults.add(new Tuple(spanSchema, rowFields.toArray(new IField[rowFields.size()])));
    }

    Assert.assertTrue(TestUtils.equals(expectedResults, exactResults));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 50 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class RegexMatcherTest method testRegexWithLimit.

/**
 * Runs the regex {@code patient} over the content attribute of the text table with a limit
 * of 2 and an offset of 0, then checks that every returned tuple is one of the three
 * expected tuples (sample tuples 4, 5 and 6 extended with their span lists) and that
 * exactly two tuples were returned.
 */
@Test
public void testRegexWithLimit() throws Exception {
    String query = "patient";
    List<Tuple> exactResultsWithLimit = RegexMatcherTestHelper.getQueryResults(TEXT_TABLE, query, Arrays.asList(RegexTestConstantsText.CONTENT), true, 2, 0);
    List<Tuple> expectedResults = new ArrayList<Tuple>();
    List<Tuple> data = RegexTestConstantsText.getSampleTextTuples();
    Schema spanSchema = new Schema.Builder().add(RegexTestConstantsText.SCHEMA_TEXT).add(RESULTS, AttributeType.LIST).build();

    // Sample tuple 4: a single match at [4, 11).
    List<Span> spans = new ArrayList<Span>();
    spans.add(new Span(RegexTestConstantsText.CONTENT, 4, 11, query, "patient"));
    List<IField> fields = new ArrayList<IField>(data.get(4).getFields());
    fields.add(new ListField<Span>(new ArrayList<Span>(spans)));
    expectedResults.add(new Tuple(spanSchema, fields.toArray(new IField[fields.size()])));

    // Sample tuple 5: matches at [4, 11) and [65, 72).
    spans.clear();
    spans.add(new Span(RegexTestConstantsText.CONTENT, 4, 11, query, "patient"));
    spans.add(new Span(RegexTestConstantsText.CONTENT, 65, 72, query, "patient"));
    fields = new ArrayList<IField>(data.get(5).getFields());
    fields.add(new ListField<Span>(new ArrayList<Span>(spans)));
    expectedResults.add(new Tuple(spanSchema, fields.toArray(new IField[fields.size()])));

    // Sample tuple 6: a single match at [4, 11).
    spans.clear();
    spans.add(new Span(RegexTestConstantsText.CONTENT, 4, 11, query, "patient"));
    fields = new ArrayList<IField>(data.get(6).getFields());
    fields.add(new ListField<Span>(new ArrayList<Span>(spans)));
    expectedResults.add(new Tuple(spanSchema, fields.toArray(new IField[fields.size()])));

    Assert.assertTrue(TestUtils.containsAll(expectedResults, exactResultsWithLimit));
    // JUnit's assertEquals takes (expected, actual); keeping that order makes a failure
    // message report the right side as "expected".
    Assert.assertEquals(3, expectedResults.size());
    Assert.assertEquals(2, exactResultsWithLimit.size());
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Aggregations

Tuple (edu.uci.ics.texera.api.tuple.Tuple)332 ArrayList (java.util.ArrayList)191 Test (org.junit.Test)178 IField (edu.uci.ics.texera.api.field.IField)130 Schema (edu.uci.ics.texera.api.schema.Schema)126 Span (edu.uci.ics.texera.api.span.Span)100 StringField (edu.uci.ics.texera.api.field.StringField)96 Attribute (edu.uci.ics.texera.api.schema.Attribute)95 IntegerField (edu.uci.ics.texera.api.field.IntegerField)92 TextField (edu.uci.ics.texera.api.field.TextField)90 DoubleField (edu.uci.ics.texera.api.field.DoubleField)65 DateField (edu.uci.ics.texera.api.field.DateField)60 SimpleDateFormat (java.text.SimpleDateFormat)58 DataWriter (edu.uci.ics.texera.storage.DataWriter)33 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)30 ListField (edu.uci.ics.texera.api.field.ListField)28 TupleSourceOperator (edu.uci.ics.texera.dataflow.source.tuple.TupleSourceOperator)24 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)23 ScanBasedSourceOperator (edu.uci.ics.texera.dataflow.source.scan.ScanBasedSourceOperator)21 ScanSourcePredicate (edu.uci.ics.texera.dataflow.source.scan.ScanSourcePredicate)21