Search in sources :

Example 1 with IDField

use of edu.uci.ics.texera.api.field.IDField in project textdb by TextDB.

the class SimilarityJoinTest method test1.

/*
 * Similarity join over two news bodies that mention the same person in
 * slightly different forms:
 *   Donald J. Trump
 *   Donald Trump
 * The predicate is built with a threshold of 0.8 (the original note names
 * NormalizedLevenshtein as the similarity measure), and the test expects the
 * two mentions to be joined into exactly one result tuple.
 */
@Test
public void test1() throws TexeraException {
    // Populate the outer table with news tuple 0 and the inner table with news tuple 1.
    JoinTestHelper.insertToTable(NEWS_TABLE_OUTER, JoinTestConstants.getNewsTuples().get(0));
    JoinTestHelper.insertToTable(NEWS_TABLE_INNER, JoinTestConstants.getNewsTuples().get(1));

    // Both sides of the join extract the name with the same regex over the news body.
    String trumpRegex = "[Dd]onald.{1,5}[Tt]rump";
    RegexMatcher innerMatcher = JoinTestHelper.getRegexMatcher(
            JoinTestHelper.NEWS_TABLE_INNER, trumpRegex, JoinTestConstants.NEWS_BODY);
    RegexMatcher outerMatcher = JoinTestHelper.getRegexMatcher(
            JoinTestHelper.NEWS_TABLE_OUTER, trumpRegex, JoinTestConstants.NEWS_BODY);

    SimilarityJoinPredicate joinPredicate = new SimilarityJoinPredicate(JoinTestConstants.NEWS_BODY, 0.8);
    List<Tuple> actual = JoinTestHelper.getJoinDistanceResults(
            innerMatcher, outerMatcher, joinPredicate, Integer.MAX_VALUE, 0);

    // Each join input carries the news schema plus the span list produced by the regex matcher.
    Schema inputSchema = new Schema.Builder()
            .add(JoinTestConstants.NEWS_SCHEMA)
            .add(SchemaConstants.SPAN_LIST_ATTRIBUTE)
            .build();
    Schema outputSchema = joinPredicate.generateOutputSchema(inputSchema, inputSchema);

    // Expected spans: one per side, with the attribute name prefixed by the side it came from.
    Span innerSpan = new Span("inner_" + JoinTestConstants.NEWS_BODY, 5, 20, trumpRegex, "Donald J. Trump", -1);
    Span outerSpan = new Span("outer_" + JoinTestConstants.NEWS_BODY, 18, 30, trumpRegex, "Donald Trump", -1);
    List<Span> expectedSpans = Arrays.asList(innerSpan, outerSpan);

    String innerTitle = "Alternative Facts and the Costs of Trump-Branded Reality";
    String innerBody = "When Donald J. Trump swore the presidential oath on Friday, he assumed "
            + "responsibility not only for the levers of government but also for one of "
            + "the United States’ most valuable assets, battered though it may be: its credibility. "
            + "The country’s sentimental reverence for truth and its jealously guarded press freedoms, "
            + "while never perfect, have been as important to its global standing as the strength of "
            + "its military and the reliability of its currency. It’s the bedrock of that "
            + "American exceptionalism we’ve heard so much about for so long.";
    String outerTitle = "UCI marchers protest as Trump begins his presidency";
    String outerBody = "a few hours after Donald Trump was sworn in Friday as the nation’s 45th president, "
            + "a line of more than 100 UC Irvine faculty members and students took to the campus "
            + "in pouring rain to demonstrate their opposition to his policies on immigration and "
            + "other issues and urge other opponents to keep organizing during Trump’s presidency.";

    // The expected tuple is given a freshly generated random ID.
    Tuple expected = new Tuple(
            outputSchema,
            new IDField(UUID.randomUUID().toString()),
            new IntegerField(2),
            new TextField(innerTitle),
            new TextField(innerBody),
            new IntegerField(1),
            new TextField(outerTitle),
            new TextField(outerBody),
            new ListField<>(expectedSpans));

    Assert.assertTrue(TestUtils.equals(Arrays.asList(expected), actual));
}
Also used : IDField(edu.uci.ics.texera.api.field.IDField) SimilarityJoinPredicate(edu.uci.ics.texera.dataflow.join.SimilarityJoinPredicate) Schema(edu.uci.ics.texera.api.schema.Schema) IntegerField(edu.uci.ics.texera.api.field.IntegerField) Span(edu.uci.ics.texera.api.span.Span) TextField(edu.uci.ics.texera.api.field.TextField) RegexMatcher(edu.uci.ics.texera.dataflow.regexmatcher.RegexMatcher) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 2 with IDField

use of edu.uci.ics.texera.api.field.IDField in project textdb by TextDB.

the class SimilarityJoinTest method test3.

/*
 * Similarity join over two news bodies that mention related product names:
 *   Galaxy S8
 *   Galaxy Note 7
 * The predicate is built with a threshold of 0.5 (the original note names
 * NormalizedLevenshtein as the similarity measure), and the test expects the
 * two mentions to be joined into exactly one result tuple.
 */
@Test
public void test3() throws TexeraException {
    // Populate the outer table with news tuple 2 and the inner table with news tuple 3.
    JoinTestHelper.insertToTable(NEWS_TABLE_OUTER, JoinTestConstants.getNewsTuples().get(2));
    JoinTestHelper.insertToTable(NEWS_TABLE_INNER, JoinTestConstants.getNewsTuples().get(3));

    // Both sides of the join extract the phone name with the same regex over the news body.
    String phoneRegex = "[Gg]alaxy.{1,6}\\d";
    RegexMatcher innerMatcher = JoinTestHelper.getRegexMatcher(
            JoinTestHelper.NEWS_TABLE_INNER, phoneRegex, JoinTestConstants.NEWS_BODY);
    RegexMatcher outerMatcher = JoinTestHelper.getRegexMatcher(
            JoinTestHelper.NEWS_TABLE_OUTER, phoneRegex, JoinTestConstants.NEWS_BODY);

    SimilarityJoinPredicate joinPredicate = new SimilarityJoinPredicate(JoinTestConstants.NEWS_BODY, 0.5);
    List<Tuple> actual = JoinTestHelper.getJoinDistanceResults(
            innerMatcher, outerMatcher, joinPredicate, Integer.MAX_VALUE, 0);

    // Each join input carries the news schema plus the span list produced by the regex matcher.
    Schema inputSchema = new Schema.Builder()
            .add(JoinTestConstants.NEWS_SCHEMA)
            .add(SchemaConstants.SPAN_LIST_ATTRIBUTE)
            .build();
    Schema outputSchema = joinPredicate.generateOutputSchema(inputSchema, inputSchema);

    // Expected spans: one per side, with the attribute name prefixed by the side it came from.
    Span innerSpan = new Span("inner_" + JoinTestConstants.NEWS_BODY, 327, 336, phoneRegex, "Galaxy S8", -1);
    Span outerSpan = new Span("outer_" + JoinTestConstants.NEWS_BODY, 21, 34, phoneRegex, "Galaxy Note 7", -1);
    List<Span> expectedSpans = Arrays.asList(innerSpan, outerSpan);

    String innerTitle = "This is how Samsung plans to prevent future phones from catching fire";
    String innerBody = "Samsung said that it has implemented a new eight-step testing process for "
            + "its lithium ion batteries, and that it’s forming a battery advisory board as well, "
            + "comprised of academics from Cambridge, Berkeley, and Stanford. "
            + "Note, this is for all lithium ion batteries in Samsung products, "
            + "not just Note phablets or the anticipated Galaxy S8 phone.";
    String outerTitle = "Samsung Explains Note 7 Battery Explosions, And Turns Crisis Into Opportunity";
    String outerBody = "Samsung launched the Galaxy Note 7 to record preorders and sales in August, "
            + "but the rosy start soon turned sour. Samsung had to initiate a recall in September of "
            + "the first version of the Note 7 due to faulty batteries that overheated and exploded. "
            + "By October it had to recall over 2 million devices and discontinue the product. "
            + "It’s estimated that the recall will cost Samsung $5.3 billion.";

    // The expected tuple is given a freshly generated random ID.
    Tuple expected = new Tuple(
            outputSchema,
            new IDField(UUID.randomUUID().toString()),
            new IntegerField(4),
            new TextField(innerTitle),
            new TextField(innerBody),
            new IntegerField(3),
            new TextField(outerTitle),
            new TextField(outerBody),
            new ListField<>(expectedSpans));

    Assert.assertTrue(TestUtils.equals(Arrays.asList(expected), actual));
}
Also used : IDField(edu.uci.ics.texera.api.field.IDField) SimilarityJoinPredicate(edu.uci.ics.texera.dataflow.join.SimilarityJoinPredicate) Schema(edu.uci.ics.texera.api.schema.Schema) IntegerField(edu.uci.ics.texera.api.field.IntegerField) Span(edu.uci.ics.texera.api.span.Span) TextField(edu.uci.ics.texera.api.field.TextField) RegexMatcher(edu.uci.ics.texera.dataflow.regexmatcher.RegexMatcher) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Example 3 with IDField

use of edu.uci.ics.texera.api.field.IDField in project textdb by TextDB.

the class PlanStore method addPlan.

/**
 * Adds a logical plan JSON to the plan store.
 *
 * <p>The JSON string is parsed and re-serialized with Jackson before it is stored; this
 * rejects strings that are not valid JSON and stores the re-serialized form.
 *
 * @param planName the name of the plan
 * @param description the description of the plan
 * @param logicalPlanJson the logical plan JSON string
 * @return the ID field returned by the data writer for the inserted plan tuple
 * @throws TexeraException when any argument is null, when the plan name does not match
 *         {@code PlanStoreConstants.VALID_PLAN_NAME}, or when a plan with the same name
 *         already exists; a {@code StorageException} is thrown when the JSON string cannot
 *         be parsed
 */
public IDField addPlan(String planName, String description, String logicalPlanJson) throws TexeraException {
    if (planName == null || description == null || logicalPlanJson == null) {
        throw new TexeraException("Arguments cannot be null when adding a plan");
    }
    if (!PlanStoreConstants.VALID_PLAN_NAME.matcher(planName).find()) {
        throw new TexeraException("Plan name is not valid. It can only contain alphanumeric characters, " + "underscore, and hyphen.");
    }
    if (getPlan(planName) != null) {
        throw new TexeraException("A plan with the same name already exists");
    }
    try {
        // Converting the JSON String to a JSON Node to minimize space usage and to check validity of JSON string
        ObjectMapper objectMapper = new ObjectMapper();
        JsonNode jsonNode = objectMapper.readValue(logicalPlanJson, JsonNode.class);
        logicalPlanJson = objectMapper.writeValueAsString(jsonNode);
    } catch (IOException e) {
        // NOTE(review): the IOException cause is dropped here; consider passing `e` along
        // if StorageException offers a (message, cause) constructor — TODO confirm.
        throw new StorageException("logical plan json is an invalid json string: " + logicalPlanJson);
    }
    Tuple tuple = new Tuple(PlanStoreConstants.SCHEMA_PLAN, new StringField(planName), new StringField(description), new StringField(logicalPlanJson));
    DataWriter dataWriter = relationManager.getTableDataWriter(PlanStoreConstants.TABLE_NAME);
    dataWriter.open();
    try {
        return dataWriter.insertTuple(tuple);
    } finally {
        // Always release the writer, even when the insert fails.
        dataWriter.close();
    }
}
Also used : IDField(edu.uci.ics.texera.api.field.IDField) StringField(edu.uci.ics.texera.api.field.StringField) JsonNode(com.fasterxml.jackson.databind.JsonNode) IOException(java.io.IOException) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) StorageException(edu.uci.ics.texera.api.exception.StorageException) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Tuple(edu.uci.ics.texera.api.tuple.Tuple) DataWriter(edu.uci.ics.texera.storage.DataWriter)

Example 4 with IDField

use of edu.uci.ics.texera.api.field.IDField in project textdb by TextDB.

the class PlanStore method updatePlanInternal.

/**
 * Updates the description and/or the logical plan JSON of the plan with the given name.
 *
 * <p>If {@code description} is null, the existing description is kept. If
 * {@code logicalPlanJson} is null, the existing plan JSON is kept. The method returns
 * without doing anything when no plan with the given name exists, or when both
 * {@code description} and {@code logicalPlanJson} are null.
 *
 * <p>A non-null JSON string is parsed and re-serialized with Jackson before it is stored;
 * this rejects strings that are not valid JSON.
 *
 * @param planName the name of the plan
 * @param description the new description of the plan, or null to keep the current one
 * @param logicalPlanJson the new plan JSON string, or null to keep the current one
 * @throws TexeraException declared by this method; a {@code StorageException} is thrown
 *         when the JSON string cannot be parsed
 */
private void updatePlanInternal(String planName, String description, String logicalPlanJson) throws TexeraException {
    Tuple existingPlan = getPlan(planName);
    if (existingPlan == null) {
        return;
    }
    // Checking if an updated description or logical plan JSON string has been provided
    if (description == null && logicalPlanJson == null) {
        return;
    }
    // Checking if the logical plan JSON string needs to be updated
    if (logicalPlanJson != null) {
        // Compressing and checking the validity of the logical plan JSON string
        try {
            ObjectMapper objectMapper = new ObjectMapper();
            JsonNode jsonNode = objectMapper.readValue(logicalPlanJson, JsonNode.class);
            logicalPlanJson = objectMapper.writeValueAsString(jsonNode);
        } catch (IOException e) {
            // NOTE(review): the IOException cause is dropped here; consider passing `e` along
            // if StorageException offers a (message, cause) constructor — TODO confirm.
            throw new StorageException("logical plan json is an invalid json string: " + logicalPlanJson);
        }
    }
    // Getting the fields in order for performing the update
    IDField idField = (IDField) existingPlan.getField(SchemaConstants._ID);
    IField descriptionField = description != null ? new StringField(description) : existingPlan.getField(PlanStoreConstants.DESCRIPTION);
    IField logicalPlanJsonField = logicalPlanJson != null ? new StringField(logicalPlanJson) : existingPlan.getField(PlanStoreConstants.LOGICAL_PLAN_JSON);
    // Creating a tuple out of all the fields
    Tuple newTuple = new Tuple(PlanStoreConstants.SCHEMA_PLAN, new StringField(planName), descriptionField, logicalPlanJsonField);
    // Writing the updated tuple
    DataWriter dataWriter = relationManager.getTableDataWriter(PlanStoreConstants.TABLE_NAME);
    dataWriter.open();
    try {
        dataWriter.updateTuple(newTuple, idField);
    } finally {
        // Always release the writer, even when the update fails.
        dataWriter.close();
    }
}
Also used : IDField(edu.uci.ics.texera.api.field.IDField) StringField(edu.uci.ics.texera.api.field.StringField) JsonNode(com.fasterxml.jackson.databind.JsonNode) IOException(java.io.IOException) IField(edu.uci.ics.texera.api.field.IField) StorageException(edu.uci.ics.texera.api.exception.StorageException) Tuple(edu.uci.ics.texera.api.tuple.Tuple) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) DataWriter(edu.uci.ics.texera.storage.DataWriter)

Example 5 with IDField

use of edu.uci.ics.texera.api.field.IDField in project textdb by TextDB.

the class NlpSplitTest method test2.

/*
 * Runs the NLP split operator in ONE_TO_MANY mode over the one-to-many test
 * tuple, checks the output against the expected result tuples, and verifies
 * that no two output tuples share the same _ID field.
 */
@Test
public void test2() throws TexeraException, ParseException {
    TupleSourceOperator source = new TupleSourceOperator(
            NlpSplitTestConstants.getOneToManyTestTuple(), NlpSplitTestConstants.SPLIT_SCHEMA);
    NlpSplitPredicate splitPredicate = new NlpSplitPredicate(
            NLPOutputType.ONE_TO_MANY, NlpSplitTestConstants.TEXT, PropertyNameConstants.NLP_OUTPUT_TYPE);
    NlpSplitOperator splitOperator = new NlpSplitOperator(splitPredicate);
    TupleSink sink = new TupleSink();

    // Wire the pipeline: source -> split -> sink.
    splitOperator.setInputOperator(source);
    sink.setInputOperator(splitOperator);

    sink.open();
    List<Tuple> results = sink.collectAllTuples();
    sink.close();

    Assert.assertTrue(TestUtils.equals(NlpSplitTestConstants.getOneToManyResultTuple(), results));

    // Set.add returns false for an element that is already present, so this
    // fails on the first duplicated ID.
    Set<IDField> seenIds = new HashSet<IDField>();
    for (Tuple tuple : results) {
        IDField id = tuple.getField(SchemaConstants._ID);
        Assert.assertTrue(seenIds.add(id));
    }
}
Also used : TupleSink(edu.uci.ics.texera.dataflow.sink.tuple.TupleSink) IDField(edu.uci.ics.texera.api.field.IDField) TupleSourceOperator(edu.uci.ics.texera.dataflow.source.tuple.TupleSourceOperator) Tuple(edu.uci.ics.texera.api.tuple.Tuple) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

IDField (edu.uci.ics.texera.api.field.IDField)11 Tuple (edu.uci.ics.texera.api.tuple.Tuple)10 Test (org.junit.Test)6 StringField (edu.uci.ics.texera.api.field.StringField)5 Schema (edu.uci.ics.texera.api.schema.Schema)5 StorageException (edu.uci.ics.texera.api.exception.StorageException)3 Attribute (edu.uci.ics.texera.api.schema.Attribute)3 Span (edu.uci.ics.texera.api.span.Span)3 DataWriter (edu.uci.ics.texera.storage.DataWriter)3 IOException (java.io.IOException)3 JsonNode (com.fasterxml.jackson.databind.JsonNode)2 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)2 IField (edu.uci.ics.texera.api.field.IField)2 IntegerField (edu.uci.ics.texera.api.field.IntegerField)2 TextField (edu.uci.ics.texera.api.field.TextField)2 SimilarityJoinPredicate (edu.uci.ics.texera.dataflow.join.SimilarityJoinPredicate)2 RegexMatcher (edu.uci.ics.texera.dataflow.regexmatcher.RegexMatcher)2 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)1 ListField (edu.uci.ics.texera.api.field.ListField)1 TupleSink (edu.uci.ics.texera.dataflow.sink.tuple.TupleSink)1