Use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
Class RegexMatcherTest, method testGetNextTupleStaffEmail.
/**
 * Runs an anchored e-mail regex over the EMAIL attribute of the staff table and checks that
 * the first two sample staff tuples come back, each extended with one span covering the
 * whole e-mail value.
 */
@Test
public void testGetNextTupleStaffEmail() throws Exception {
    // Anchored with ^ and $, so only a value that is an e-mail address in its entirety matches.
    String query = "^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$";
    List<Tuple> exactResults = RegexMatcherTestHelper.getQueryResults(
            STAFF_TABLE, query, Arrays.asList(RegexTestConstantStaff.EMAIL));

    List<Tuple> staffTuples = RegexTestConstantStaff.getSampleStaffTuples();
    // Expected output schema: the staff schema plus a trailing RESULTS list attribute.
    Schema schemaWithResults = new Schema.Builder()
            .add(RegexTestConstantStaff.SCHEMA_STAFF)
            .add(RESULTS, AttributeType.LIST)
            .build();

    // First sample tuple: expected to match "m.bocanegra@164.com" (offsets 0 to 19).
    Span firstSpan = new Span(RegexTestConstantStaff.EMAIL, 0, 19, query, "m.bocanegra@164.com");
    List<IField> firstFields = new ArrayList<IField>(staffTuples.get(0).getFields());
    firstFields.add(new ListField<Span>(new ArrayList<Span>(Arrays.asList(firstSpan))));
    Tuple firstExpected = new Tuple(schemaWithResults, firstFields.toArray(new IField[firstFields.size()]));

    // Second sample tuple: expected to match "hwangk@ske.akb.edu" (offsets 0 to 18).
    Span secondSpan = new Span(RegexTestConstantStaff.EMAIL, 0, 18, query, "hwangk@ske.akb.edu");
    List<IField> secondFields = new ArrayList<IField>(staffTuples.get(1).getFields());
    secondFields.add(new ListField<Span>(new ArrayList<Span>(Arrays.asList(secondSpan))));
    Tuple secondExpected = new Tuple(schemaWithResults, secondFields.toArray(new IField[secondFields.size()]));

    List<Tuple> expectedResults = new ArrayList<Tuple>();
    expectedResults.add(firstExpected);
    expectedResults.add(secondExpected);

    Assert.assertTrue(TestUtils.equals(expectedResults, exactResults));
}
Use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
Class LabeledRegexMatcherTest, method testIgnoreCaseLabeledRegex.
/**
 * Runs the labeled regex {@code <NAME>} with the keyword query "george lin lin" against the
 * FIRST_NAME attribute of the people table and compares only the RESULTS attribute of the
 * returned tuples with the expected span list.
 */
@Test
public void testIgnoreCaseLabeledRegex() throws Exception {
    String query = "<NAME>";
    String keywordQuery = "george lin lin";
    // NOTE(review): the meaning of the trailing arguments ("name", false, Integer.MAX_VALUE, 0)
    // is defined by RegexMatcherTestHelper.getQueryResults and is not visible here.
    List<Tuple> exactResults = RegexMatcherTestHelper.getQueryResults(
            PEOPLE_TABLE,
            query,
            keywordQuery,
            Arrays.asList(TestConstants.FIRST_NAME),
            "name",
            false,
            Integer.MAX_VALUE,
            0);

    List<Tuple> peopleTuples = TestConstants.getSamplePeopleTuples();
    // Expected output schema: the people schema plus a trailing RESULTS list attribute.
    Schema schemaWithResults = new Schema.Builder()
            .add(TestConstants.SCHEMA_PEOPLE)
            .add(RESULTS, AttributeType.LIST)
            .build();

    // The sample tuple at index 3 is expected to match "george lin lin" (offsets 0 to 14).
    Span nameSpan = new Span(TestConstants.FIRST_NAME, 0, 14, query, "george lin lin");
    List<IField> expectedFields = new ArrayList<IField>(peopleTuples.get(3).getFields());
    expectedFields.add(new ListField<Span>(new ArrayList<Span>(Arrays.asList(nameSpan))));

    List<Tuple> expectedResults = new ArrayList<Tuple>();
    expectedResults.add(new Tuple(schemaWithResults, expectedFields.toArray(new IField[expectedFields.size()])));

    // Only the RESULTS attribute takes part in the comparison.
    List<String> attributeNames = new ArrayList<String>(Arrays.asList(RESULTS));
    Assert.assertTrue(TestUtils.attributeEquals(expectedResults, exactResults, attributeNames));
}
Use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
Class SampleExtraction, method parsePromedHTML.
/**
 * Converts one HTML page into a tuple of {@code PromedSchema.PROMED_SCHEMA}.
 *
 * The page is parsed with Jsoup and the text of the element whose id is "preview" becomes
 * the tuple's text field; the file name becomes its string field.
 *
 * @param fileName name stored in the tuple's first field
 * @param content HTML source of the page
 * @return the tuple, or {@code null} if anything goes wrong while parsing or building it
 *         (for example when the page has no "preview" element)
 */
public static Tuple parsePromedHTML(String fileName, String content) {
    Tuple parsedTuple = null;
    try {
        String previewText = Jsoup.parse(content).getElementById("preview").text();
        parsedTuple = new Tuple(
                PromedSchema.PROMED_SCHEMA,
                new StringField(fileName),
                new TextField(previewText));
    } catch (Exception e) {
        // Best effort: a page that cannot be parsed is reported to the caller as null.
    }
    return parsedTuple;
}
Use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
Class SampleExtraction, method writeSampleIndex.
/**
 * Reads every file in {@code promedFilesDirectory}, converts each one to a tuple with
 * {@code parsePromedHTML}, and writes the tuples into a freshly created
 * {@code PROMED_SAMPLE_TABLE} stored under {@code promedIndexDirectory}.
 *
 * Files for which {@code parsePromedHTML} returns {@code null} are skipped. Any table that
 * already exists under the same name is deleted before the new one is created.
 *
 * @throws IllegalStateException if the source directory cannot be listed
 * @throws Exception if reading a file or any table operation fails
 */
public static void writeSampleIndex() throws Exception {
    // parse the original files
    File sourceFileFolder = new File(promedFilesDirectory);
    File[] htmlFiles = sourceFileFolder.listFiles();
    if (htmlFiles == null) {
        // listFiles() returns null when the path is not a directory or an I/O error occurs;
        // fail with a descriptive message instead of a bare NullPointerException.
        throw new IllegalStateException("Cannot list files in directory: " + sourceFileFolder);
    }

    ArrayList<Tuple> fileTuples = new ArrayList<>();
    for (File htmlFile : htmlFiles) {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the scanner even if reading throws
        try (Scanner scanner = new Scanner(htmlFile)) {
            // lines are concatenated without any separator between them
            while (scanner.hasNext()) {
                sb.append(scanner.nextLine());
            }
        }
        Tuple tuple = parsePromedHTML(htmlFile.getName(), sb.toString());
        if (tuple != null) {
            fileTuples.add(tuple);
        }
    }

    // write tuples into the table
    RelationManager relationManager = RelationManager.getInstance();
    relationManager.deleteTable(PROMED_SAMPLE_TABLE);
    relationManager.createTable(PROMED_SAMPLE_TABLE, Paths.get(promedIndexDirectory),
            PromedSchema.PROMED_SCHEMA, LuceneAnalyzerConstants.standardAnalyzerString());

    DataWriter dataWriter = relationManager.getTableDataWriter(PROMED_SAMPLE_TABLE);
    dataWriter.open();
    try {
        for (Tuple tuple : fileTuples) {
            dataWriter.insertTuple(tuple);
        }
    } finally {
        // release the writer even when an insert fails
        dataWriter.close();
    }
}
Use of edu.uci.ics.texera.api.tuple.Tuple in project textdb by TextDB.
Class TwitterSample, method createTwitterTable.
/**
 * A helper function to create a table and write twitter data into it.
 *
 * The source operator is piped through a {@code TwitterJsonConverter} into a
 * {@code TupleSink}; the table is created with the sink's output schema under the default
 * index directory, and every tuple the sink produces is inserted into it. The data writer
 * and the sink are closed even if table creation or an insert fails.
 *
 * @param tableName name of the table to create; an existing table with this name is deleted first
 * @param twitterJsonSourceOperator a source operator that provides the input raw twitter JSON string tuples
 * @return the number of tuples inserted into the table
 */
public static int createTwitterTable(String tableName, ISourceOperator twitterJsonSourceOperator) {
    TwitterJsonConverter twitterJsonConverter = new TwitterJsonConverterPredicate("twitterJson").newOperator();
    TupleSink tupleSink = new TupleSinkPredicate(null, null).newOperator();
    twitterJsonConverter.setInputOperator(twitterJsonSourceOperator);
    tupleSink.setInputOperator(twitterJsonConverter);

    // open the workflow plan and get the output schema
    tupleSink.open();
    try {
        // create the table with TupleSink's output schema
        RelationManager relationManager = RelationManager.getInstance();
        if (relationManager.checkTableExistence(tableName)) {
            relationManager.deleteTable(tableName);
        }
        relationManager.createTable(tableName, Utils.getDefaultIndexDirectory().resolve(tableName),
                tupleSink.getOutputSchema(), LuceneAnalyzerConstants.standardAnalyzerString());

        DataWriter dataWriter = relationManager.getTableDataWriter(tableName);
        dataWriter.open();
        try {
            Tuple tuple;
            int counter = 0;
            while ((tuple = tupleSink.getNextTuple()) != null) {
                dataWriter.insertTuple(tuple);
                counter++;
            }
            return counter;
        } finally {
            // the writer is closed before the sink, matching the original shutdown order
            dataWriter.close();
        }
    } finally {
        tupleSink.close();
    }
}
Aggregations