Search in sources :

Example 76 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class JoinDistancePredicate method joinTuples.

/**
 * Joins two tuples on the join attribute by pairing up spans whose start
 * positions and end positions are both within the threshold of each other.
 *
 * <p>The two tuples must agree on the _ID field and on the join attribute;
 * other fields are not compared, and the values of innerTuple are used for
 * them in the output. Every matching (outer span, inner span) pair yields
 * one combined span covering both spans.
 *
 * @param innerTuple   tuple from the inner operator; supplies the non-span output field values
 * @param outerTuple   tuple from the outer operator
 * @param outputSchema schema of the tuple to be produced
 * @return a new Tuple holding innerTuple's field values plus the list of
 *         combined spans, or null when the tuples cannot be joined (the
 *         compared fields differ, span information is missing on either
 *         side, or no pair of spans is within the threshold)
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    // The _ID field and the join attribute must be the same on both sides.
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }

    ListField<Span> spanFieldOfInnerTuple = innerTuple.getField(SchemaConstants.SPAN_LIST);
    ListField<Span> spanFieldOfOuterTuple = outerTuple.getField(SchemaConstants.SPAN_LIST);
    // If either tuple carries no span information, skip this pair so the
    // caller can move on to the next tuple.
    if (spanFieldOfInnerTuple == null || spanFieldOfOuterTuple == null) {
        return null;
    }
    List<Span> innerSpanList = null;
    List<Span> outerSpanList = null;
    if (spanFieldOfInnerTuple.getClass().equals(ListField.class)) {
        innerSpanList = spanFieldOfInnerTuple.getValue();
    }
    if (spanFieldOfOuterTuple.getClass().equals(ListField.class)) {
        outerSpanList = spanFieldOfOuterTuple.getValue();
    }
    // Previously a missing list fell through and caused a NullPointerException below.
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }

    List<Span> newJoinSpanList = new ArrayList<>();
    String attributeName = this.joinAttributeName;
    // The threshold does not change while iterating, so read it once.
    Integer threshold = this.getThreshold();
    for (Span outerSpan : outerSpanList) {
        // Only spans located on the join attribute take part in the join.
        if (!outerSpan.getAttributeName().equals(attributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(attributeName)) {
                continue;
            }
            boolean startsAreClose = Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold;
            boolean endsAreClose = Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold;
            if (startsAreClose && endsAreClose) {
                // The combined span covers both spans: smallest start to largest end.
                Integer newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                Integer newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                newJoinSpanList.add(new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue));
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }

    // Build the output fields from innerTuple's values for every attribute
    // except the span list, which is replaced by the combined spans.
    // toCollection(ArrayList::new) is used because the list is mutated right after.
    List<IField> outputFields = outputSchema.getAttributes().stream()
            .filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE))
            .map(attr -> attr.getName())
            .map(name -> innerTuple.getField(name, IField.class))
            .collect(Collectors.toCollection(ArrayList::new));
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.toArray(new IField[0]));
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) Iterator(java.util.Iterator) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) IField(edu.uci.ics.texera.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Attribute(edu.uci.ics.texera.api.schema.Attribute) ArrayList(java.util.ArrayList) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 77 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class SimilarityJoinPredicate method generateOutputSchema.

/**
 * Builds the schema of the join result from the two input schemas.
 *
 * <p>Attribute order in the result: the _ID attribute, then every inner
 * attribute (name prefixed with INNER_PREFIX), then every outer attribute
 * (name prefixed with OUTER_PREFIX), then the span list attribute, and
 * finally the payload attribute if at least one input schema contains a
 * payload. The _ID, span list and payload attributes of the inputs are
 * not copied with a prefix.
 *
 * @param innerOperatorSchema schema produced by the inner operator
 * @param outerOperatorSchema schema produced by the outer operator
 * @return the combined output schema
 */
@Override
public Schema generateOutputSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataflowException {
    List<Attribute> joined = new ArrayList<>();

    // the _ID attribute always comes first
    joined.add(SchemaConstants._ID_ATTRIBUTE);

    // prefixed copies of the inner attributes, skipping the reserved ones
    for (Attribute innerAttr : innerOperatorSchema.getAttributes()) {
        String name = innerAttr.getName();
        boolean reserved = name.equals(SchemaConstants._ID)
                || name.equals(SchemaConstants.SPAN_LIST)
                || name.equals(SchemaConstants.PAYLOAD);
        if (!reserved) {
            joined.add(new Attribute(INNER_PREFIX + name, innerAttr.getType()));
        }
    }

    // prefixed copies of the outer attributes, skipping the reserved ones
    for (Attribute outerAttr : outerOperatorSchema.getAttributes()) {
        String name = outerAttr.getName();
        boolean reserved = name.equals(SchemaConstants._ID)
                || name.equals(SchemaConstants.SPAN_LIST)
                || name.equals(SchemaConstants.PAYLOAD);
        if (!reserved) {
            joined.add(new Attribute(OUTER_PREFIX + name, outerAttr.getType()));
        }
    }

    // the span list follows the regular attributes
    joined.add(SchemaConstants.SPAN_LIST_ATTRIBUTE);

    // a payload is emitted as soon as either side has one
    boolean anyPayload = innerOperatorSchema.containsAttribute(SchemaConstants.PAYLOAD)
            || outerOperatorSchema.containsAttribute(SchemaConstants.PAYLOAD);
    if (anyPayload) {
        joined.add(SchemaConstants.PAYLOAD_ATTRIBUTE);
    }

    return new Schema(joined.toArray(new Attribute[0]));
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema)

Example 78 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class ExcelSink method open.

/**
 * Opens the sink: opens the input operator, derives the output schema,
 * creates the workbook and its timestamp-named output file, and writes the
 * header row. Does nothing unless the sink is currently closed.
 *
 * @throws TexeraException a DataflowException wrapping the IOException if
 *         the output directory or file cannot be created
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }

    inputOperator.open();
    inputSchema = inputOperator.getOutputSchema();

    // The output drops the _ID and payload attributes (matched case-insensitively)
    // as well as every attribute of LIST type.
    Attribute[] keptAttributes = inputSchema.getAttributes().stream()
            .filter(attr -> !attr.getName().equalsIgnoreCase(SchemaConstants._ID)
                    && !attr.getName().equalsIgnoreCase(SchemaConstants.PAYLOAD)
                    && !attr.getType().equals(AttributeType.LIST))
            .toArray(Attribute[]::new);
    outputSchema = new Schema(keptAttributes);

    wb = new XSSFWorkbook();

    // The file is named after the current time, e.g. 20240131-235959.xlsx
    DateFormat timestampFormat = new SimpleDateFormat("yyyyMMdd-HHmmss");
    fileName = timestampFormat.format(new Date()) + ".xlsx";

    try {
        if (Files.notExists(excelIndexDirectory)) {
            Files.createDirectories(excelIndexDirectory);
        }
        String targetPath = excelIndexDirectory.resolve(fileName).toString();
        fileOut = new FileOutputStream(targetPath);
    } catch (IOException e) {
        throw new DataflowException(e);
    }

    sheet = wb.createSheet("new sheet");

    // Row 0 is the header: one cell per output attribute name.
    Row headerRow = sheet.createRow(0);
    int column = 0;
    for (String headerName : outputSchema.getAttributeNames()) {
        headerRow.createCell(column).setCellValue(headerName);
        column++;
    }

    cursor = OPENED;
}
Also used : Date(java.util.Date) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) SimpleDateFormat(java.text.SimpleDateFormat) ArrayList(java.util.ArrayList) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) IField(edu.uci.ics.texera.api.field.IField) ISink(edu.uci.ics.texera.api.dataflow.ISink) Cell(org.apache.poi.ss.usermodel.Cell) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Path(java.nio.file.Path) DateFormat(java.text.DateFormat) IntegerField(edu.uci.ics.texera.api.field.IntegerField) Sheet(org.apache.poi.ss.usermodel.Sheet) Files(java.nio.file.Files) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) DoubleField(edu.uci.ics.texera.api.field.DoubleField) DateField(edu.uci.ics.texera.api.field.DateField) List(java.util.List) Workbook(org.apache.poi.ss.usermodel.Workbook) Utils(edu.uci.ics.texera.api.utils.Utils) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Row(org.apache.poi.ss.usermodel.Row) Schema(edu.uci.ics.texera.api.schema.Schema) IOException(java.io.IOException) Date(java.util.Date) SimpleDateFormat(java.text.SimpleDateFormat) DateFormat(java.text.DateFormat) FileOutputStream(java.io.FileOutputStream) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Row(org.apache.poi.ss.usermodel.Row) SimpleDateFormat(java.text.SimpleDateFormat)

Example 79 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class MysqlSink method open.

/**
 * Opens the sink: opens the input operator, derives the output schema by
 * removing the _id, payload and list-typed attributes, sets up the JDBC
 * connection, then drops the previous table and creates a new one (see
 * mysqlDropTable / mysqlCreateTable). Does nothing if the sink is already
 * opened.
 *
 * @throws TexeraException a DataflowException if the JDBC driver cannot be
 *         loaded or any of the database operations fails; the original
 *         exception is attached as the cause
 */
@Override
public void open() throws TexeraException {
    if (cursor == OPENED) {
        return;
    }
    inputOperator.open();
    Schema inputSchema = inputOperator.getOutputSchema();
    // _ID and payload are matched by name (case-insensitively); list fields by type.
    outputSchema = new Schema(inputSchema.getAttributes().stream()
            .filter(attr -> !attr.getName().equalsIgnoreCase(SchemaConstants._ID))
            .filter(attr -> !attr.getName().equalsIgnoreCase(SchemaConstants.PAYLOAD))
            .filter(attr -> !attr.getType().equals(AttributeType.LIST))
            .toArray(Attribute[]::new));
    // JDBC connection
    try {
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        String url = "jdbc:mysql://" + predicate.getHost() + ":" + predicate.getPort() + "/" + predicate.getDatabase() + "?autoReconnect=true&useSSL=true";
        this.connection = DriverManager.getConnection(url, predicate.getUsername(), predicate.getPassword());
        statement = connection.createStatement();
        mysqlDropTable();
        mysqlCreateTable();
        // Only mark the sink as opened once the table is ready.
        cursor = OPENED;
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        // Keep the original exception as the cause so its stack trace is not lost.
        throw new DataflowException("MysqlSink failed to connect to mysql database." + e.getMessage(), e);
    }
}
Also used : Connection(java.sql.Connection) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PreparedStatement(java.sql.PreparedStatement) Collectors(java.util.stream.Collectors) DoubleField(edu.uci.ics.texera.api.field.DoubleField) ArrayList(java.util.ArrayList) DateField(edu.uci.ics.texera.api.field.DateField) SQLException(java.sql.SQLException) List(java.util.List) Stream(java.util.stream.Stream) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) IField(edu.uci.ics.texera.api.field.IField) ISink(edu.uci.ics.texera.api.dataflow.ISink) Statement(java.sql.Statement) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) DriverManager(java.sql.DriverManager) IntegerField(edu.uci.ics.texera.api.field.IntegerField) SQLException(java.sql.SQLException) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Example 80 with Attribute

use of edu.uci.ics.texera.api.schema.Attribute in project textdb by TextDB.

the class ExcelSinkTest method writeSampleExcelFile.

/**
 * Creates two tuples, writes them into an excel file through ExcelSink,
 * and deletes the generated file afterwards.
 *
 * @throws Exception if building the test data (e.g. parsing the dates),
 *         running the sink, or deleting the generated file fails
 */
@Test
public void writeSampleExcelFile() throws Exception {
    // Span list shared by both tuples
    List<Span> list = new ArrayList<>();
    Span span1 = new Span("firstName", 0, 5, "bruce", "bruce");
    // NOTE(review): "lastnName" looks like a typo for "lastName"; kept as is
    // because this test does not check the span's attribute name.
    Span span2 = new Span("lastnName", 0, 5, "jacki", "jacki");
    list.add(span1);
    list.add(span2);

    // Schema = the people attributes followed by the span list attribute
    int peopleAttributeCount = TestConstants.ATTRIBUTES_PEOPLE.length;
    Attribute[] schemaAttributes = new Attribute[peopleAttributeCount + 1];
    System.arraycopy(TestConstants.ATTRIBUTES_PEOPLE, 0, schemaAttributes, 0, peopleAttributeCount);
    schemaAttributes[peopleAttributeCount] = SchemaConstants.SPAN_LIST_ATTRIBUTE;

    IField[] fields1 = { new StringField("bruce"), new StringField("john Lee"), new IntegerField(46), new DoubleField(5.50), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("01-14-1970")), new TextField("Tall Angry"), new ListField<>(list) };
    IField[] fields2 = { new StringField("test"), new StringField("jackie chan"), new IntegerField(0), new DoubleField(6.0), new DateField(new SimpleDateFormat("MM-dd-yyyy").parse("09-18-1994")), new TextField("Angry Bird"), new ListField<>(list) };
    Tuple tuple1 = new Tuple(new Schema(schemaAttributes), fields1);
    Tuple tuple2 = new Tuple(new Schema(schemaAttributes), fields2);

    // The mocked input operator serves the schema once and the two tuples, then null.
    IOperator inputOperator = Mockito.mock(IOperator.class);
    Mockito.when(inputOperator.getOutputSchema()).thenReturn(new Schema(schemaAttributes)).thenReturn(null);
    Mockito.when(inputOperator.getNextTuple()).thenReturn(tuple1).thenReturn(tuple2).thenReturn(null);

    excelSink = new ExcelSink(new ExcelSinkPredicate());
    excelSink.setInputOperator(inputOperator);
    excelSink.open();
    excelSink.collectAllTuples();
    excelSink.close();

    // Clean up the file written by the sink.
    Files.deleteIfExists(excelSink.getFilePath());
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) Schema(edu.uci.ics.texera.api.schema.Schema) ArrayList(java.util.ArrayList) IntegerField(edu.uci.ics.texera.api.field.IntegerField) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span) StringField(edu.uci.ics.texera.api.field.StringField) TextField(edu.uci.ics.texera.api.field.TextField) DateField(edu.uci.ics.texera.api.field.DateField) SimpleDateFormat(java.text.SimpleDateFormat) DoubleField(edu.uci.ics.texera.api.field.DoubleField) Tuple(edu.uci.ics.texera.api.tuple.Tuple) Test(org.junit.Test)

Aggregations

Attribute (edu.uci.ics.texera.api.schema.Attribute)98 Test (org.junit.Test)81 Tuple (edu.uci.ics.texera.api.tuple.Tuple)78 ArrayList (java.util.ArrayList)76 Schema (edu.uci.ics.texera.api.schema.Schema)75 IField (edu.uci.ics.texera.api.field.IField)60 StringField (edu.uci.ics.texera.api.field.StringField)56 TextField (edu.uci.ics.texera.api.field.TextField)56 IntegerField (edu.uci.ics.texera.api.field.IntegerField)54 DoubleField (edu.uci.ics.texera.api.field.DoubleField)53 Span (edu.uci.ics.texera.api.span.Span)51 DateField (edu.uci.ics.texera.api.field.DateField)50 SimpleDateFormat (java.text.SimpleDateFormat)47 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)28 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)9 IOperator (edu.uci.ics.texera.api.dataflow.IOperator)8 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)6 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)6 List (java.util.List)6 Collectors (java.util.stream.Collectors)5