Search in sources :

Example 56 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class RegexMatcherTest method testGetNextTupleStaffEmail.

/**
 * Runs an anchored e-mail regex over the EMAIL attribute of the staff table and
 * checks that the returned tuples equal the first two sample staff tuples, each
 * extended with a RESULTS list field holding one whole-value span.
 */
@Test
public void testGetNextTupleStaffEmail() throws Exception {
    String emailRegex = "^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$";
    List<Tuple> actualTuples = RegexMatcherTestHelper.getQueryResults(STAFF_TABLE, emailRegex, Arrays.asList(RegexTestConstantStaff.EMAIL));

    List<Tuple> staffTuples = RegexTestConstantStaff.getSampleStaffTuples();
    // Output schema = staff schema plus the RESULTS list attribute.
    Schema schemaWithResults = new Schema.Builder().add(RegexTestConstantStaff.SCHEMA_STAFF).add(RESULTS, AttributeType.LIST).build();
    List<Tuple> wantedTuples = new ArrayList<Tuple>();

    // Sample staff tuple 0: span [0, 19) with value "m.bocanegra@164.com".
    List<Span> firstSpans = new ArrayList<Span>();
    firstSpans.add(new Span(RegexTestConstantStaff.EMAIL, 0, 19, emailRegex, "m.bocanegra@164.com"));
    List<IField> firstFields = new ArrayList<IField>(staffTuples.get(0).getFields());
    firstFields.add(new ListField<Span>(firstSpans));
    IField[] firstFieldArray = firstFields.toArray(new IField[firstFields.size()]);
    wantedTuples.add(new Tuple(schemaWithResults, firstFieldArray));

    // Sample staff tuple 1: span [0, 18) with value "hwangk@ske.akb.edu".
    List<Span> secondSpans = new ArrayList<Span>();
    secondSpans.add(new Span(RegexTestConstantStaff.EMAIL, 0, 18, emailRegex, "hwangk@ske.akb.edu"));
    List<IField> secondFields = new ArrayList<IField>(staffTuples.get(1).getFields());
    secondFields.add(new ListField<Span>(secondSpans));
    IField[] secondFieldArray = secondFields.toArray(new IField[secondFields.size()]);
    wantedTuples.add(new Tuple(schemaWithResults, secondFieldArray));

    Assert.assertTrue(TestUtils.equals(wantedTuples, actualTuples));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 57 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class LabeledRegexMatcherTest method testIgnoreCaseLabeledRegex.

/**
 * Runs the labeled regex "<NAME>" with the keyword query "george lin lin" over
 * the FIRST_NAME attribute of the people table and compares only the RESULTS
 * attribute of the returned tuples against the expected span list.
 */
@Test
public void testIgnoreCaseLabeledRegex() throws Exception {
    String labeledRegex = "<NAME>";
    String labelKeywords = "george lin lin";
    List<Tuple> actualTuples = RegexMatcherTestHelper.getQueryResults(PEOPLE_TABLE, labeledRegex, labelKeywords, Arrays.asList(TestConstants.FIRST_NAME), "name", false, Integer.MAX_VALUE, 0);

    List<Tuple> peopleTuples = TestConstants.getSamplePeopleTuples();
    // Output schema = people schema plus the RESULTS list attribute.
    Schema schemaWithResults = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS, AttributeType.LIST).build();

    // Sample people tuple 3: span [0, 14) with value "george lin lin".
    List<Span> matchedSpans = new ArrayList<>();
    matchedSpans.add(new Span(TestConstants.FIRST_NAME, 0, 14, labeledRegex, "george lin lin"));
    List<IField> wantedFields = new ArrayList<>(peopleTuples.get(3).getFields());
    IField resultsField = new ListField<>(matchedSpans);
    wantedFields.add(resultsField);

    List<Tuple> wantedTuples = new ArrayList<>();
    IField[] wantedFieldArray = wantedFields.toArray(new IField[wantedFields.size()]);
    wantedTuples.add(new Tuple(schemaWithResults, wantedFieldArray));

    // Only the RESULTS attribute takes part in the comparison.
    List<String> comparedAttributes = new ArrayList<>();
    comparedAttributes.add(RESULTS);
    Assert.assertTrue(TestUtils.attributeEquals(wantedTuples, actualTuples, comparedAttributes));
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) ListField(edu.uci.ics.texera.api.field.ListField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 58 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class SampleExtraction method parsePromedHTML.

/**
 * Builds a tuple with the Promed schema from one HTML document.
 *
 * The text of the element whose id is "preview" becomes the text field and
 * {@code fileName} becomes the string field.
 *
 * @param fileName name stored in the tuple's string field
 * @param content  raw HTML to parse
 * @return the new tuple, or {@code null} if any exception is thrown while
 *         parsing the HTML or constructing the tuple
 */
public static Tuple parsePromedHTML(String fileName, String content) {
    try {
        // NOTE(review): presumably a document without a "preview" element ends up in the
        // catch block via a NullPointerException - confirm against jsoup's getElementById.
        String previewText = Jsoup.parse(content).getElementById("preview").text();
        return new Tuple(PromedSchema.PROMED_SCHEMA, new StringField(fileName), new TextField(previewText));
    } catch (Exception e) {
        // Best effort: callers treat null as "skip this document".
        return null;
    }
}
Also used : StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) Document(org.jsoup.nodes.Document) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 59 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class SampleExtraction method writeSampleIndex.

/**
 * Rebuilds the Promed sample table from the HTML files in {@code promedFilesDirectory}.
 *
 * Every file in the directory is read and passed to {@code parsePromedHTML}; files for
 * which it returns {@code null} are skipped. The table {@code PROMED_SAMPLE_TABLE} is then
 * deleted, recreated under {@code promedIndexDirectory} with the Promed schema and the
 * standard analyzer, and filled with the parsed tuples.
 *
 * @throws IllegalStateException if the entries of {@code promedFilesDirectory} cannot be listed
 * @throws Exception if reading a file or writing to the table fails
 */
public static void writeSampleIndex() throws Exception {
    // parse the original files
    File sourceFileFolder = new File(promedFilesDirectory);
    // listFiles() returns null (not an empty array) when the path is not a directory
    // or an I/O error occurs; fail with a clear message instead of an NPE in the loop.
    File[] htmlFiles = sourceFileFolder.listFiles();
    if (htmlFiles == null) {
        throw new IllegalStateException("Cannot list files in directory: " + promedFilesDirectory);
    }
    ArrayList<Tuple> fileTuples = new ArrayList<>();
    for (File htmlFile : htmlFiles) {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the scanner even if reading throws
        try (Scanner scanner = new Scanner(htmlFile)) {
            // lines are concatenated without their line terminators
            while (scanner.hasNext()) {
                sb.append(scanner.nextLine());
            }
        }
        Tuple tuple = parsePromedHTML(htmlFile.getName(), sb.toString());
        if (tuple != null) {
            fileTuples.add(tuple);
        }
    }
    // write tuples into the table
    RelationManager relationManager = RelationManager.getInstance();
    relationManager.deleteTable(PROMED_SAMPLE_TABLE);
    relationManager.createTable(PROMED_SAMPLE_TABLE, Paths.get(promedIndexDirectory), PromedSchema.PROMED_SCHEMA, LuceneAnalyzerConstants.standardAnalyzerString());
    DataWriter dataWriter = relationManager.getTableDataWriter(PROMED_SAMPLE_TABLE);
    dataWriter.open();
    try {
        for (Tuple tuple : fileTuples) {
            dataWriter.insertTuple(tuple);
        }
    } finally {
        // release the writer even when an insert fails
        dataWriter.close();
    }
}
Also used : Scanner(java.util.Scanner) ArrayList(java.util.ArrayList) File(java.io.File) Tuple(edu.uci.ics.texera.api.tuple.Tuple) RelationManager(edu.uci.ics.texera.storage.RelationManager) DataWriter(edu.uci.ics.texera.storage.DataWriter)

Example 60 with Tuple

use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.

the class TwitterSample method createTwitterTable.

/**
 * A helper function to create a table and write twitter data into it.
 *
 * The source operator is piped through a TwitterJsonConverter into a TupleSink; the
 * table is created with the sink's output schema and every tuple produced by the sink
 * is inserted. If a table with the same name already exists it is deleted first.
 *
 * @param tableName name of the table to (re)create; its index is placed in the default
 *                  index directory under this name
 * @param twitterJsonSourceOperator, a source operator that provides the input raw twitter JSON string tuples
 * @return the number of tuples inserted into the table
 */
public static int createTwitterTable(String tableName, ISourceOperator twitterJsonSourceOperator) {
    TwitterJsonConverter twitterJsonConverter = new TwitterJsonConverterPredicate("twitterJson").newOperator();
    TupleSink tupleSink = new TupleSinkPredicate(null, null).newOperator();
    twitterJsonConverter.setInputOperator(twitterJsonSourceOperator);
    tupleSink.setInputOperator(twitterJsonConverter);
    // open the workflow plan and get the output schema
    tupleSink.open();
    try {
        // create the table with TupleSink's output schema
        RelationManager relationManager = RelationManager.getInstance();
        if (relationManager.checkTableExistence(tableName)) {
            relationManager.deleteTable(tableName);
        }
        relationManager.createTable(tableName, Utils.getDefaultIndexDirectory().resolve(tableName), tupleSink.getOutputSchema(), LuceneAnalyzerConstants.standardAnalyzerString());
        DataWriter dataWriter = relationManager.getTableDataWriter(tableName);
        dataWriter.open();
        int counter = 0;
        try {
            Tuple tuple;
            while ((tuple = tupleSink.getNextTuple()) != null) {
                dataWriter.insertTuple(tuple);
                counter++;
            }
        } finally {
            // release the writer even when reading or inserting a tuple fails
            dataWriter.close();
        }
        return counter;
    } finally {
        // the sink was opened above; close it on every exit path (after the writer)
        tupleSink.close();
    }
}
Also used : TupleSink(edu.uci.ics.texera.dataflow.sink.tuple.TupleSink) TwitterJsonConverterPredicate(edu.uci.ics.texera.dataflow.twitter.TwitterJsonConverterPredicate) TupleSinkPredicate(edu.uci.ics.texera.dataflow.sink.tuple.TupleSinkPredicate) TwitterJsonConverter(edu.uci.ics.texera.dataflow.twitter.TwitterJsonConverter) Tuple(edu.uci.ics.texera.api.tuple.Tuple) RelationManager(edu.uci.ics.texera.storage.RelationManager) DataWriter(edu.uci.ics.texera.storage.DataWriter)

Aggregations

Tuple (edu.uci.ics.texera.api.tuple.Tuple)332 ArrayList (java.util.ArrayList)191 Test (org.junit.Test)178 IField (edu.uci.ics.texera.api.field.IField)130 Schema (edu.uci.ics.texera.api.schema.Schema)126 Span (edu.uci.ics.texera.api.span.Span)100 StringField (edu.uci.ics.texera.api.field.StringField)96 Attribute (edu.uci.ics.texera.api.schema.Attribute)95 IntegerField (edu.uci.ics.texera.api.field.IntegerField)92 TextField (edu.uci.ics.texera.api.field.TextField)90 DoubleField (edu.uci.ics.texera.api.field.DoubleField)65 DateField (edu.uci.ics.texera.api.field.DateField)60 SimpleDateFormat (java.text.SimpleDateFormat)58 DataWriter (edu.uci.ics.texera.storage.DataWriter)33 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)30 ListField (edu.uci.ics.texera.api.field.ListField)28 TupleSourceOperator (edu.uci.ics.texera.dataflow.source.tuple.TupleSourceOperator)24 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)23 ScanBasedSourceOperator (edu.uci.ics.texera.dataflow.source.scan.ScanBasedSourceOperator)21 ScanSourcePredicate (edu.uci.ics.texera.dataflow.source.scan.ScanSourcePredicate)21