Search in sources :

Example 1 with SchemaConstants

use of edu.uci.ics.textdb.api.constants.SchemaConstants in project textdb by TextDB.

The method open of the class ExcelSink.

/**
 * Opens the sink and prepares a fresh workbook for writing.
 *
 * Does nothing unless the sink is currently in the CLOSED state. Otherwise it
 * opens the input operator, derives the output schema from the input schema by
 * dropping the _ID and PAYLOAD attributes (names compared case-insensitively)
 * and every LIST-typed attribute, creates the workbook, opens an output file
 * named "yyyyMMdd-HHmmss" + ".xlsx" (current time) under excelIndexDirectory
 * (creating that directory when it does not exist), and writes the output
 * attribute names into row 0 of a sheet called "new sheet".
 *
 * @throws TextDBException declared by the signature; a DataFlowException
 *         wrapping the IOException is thrown when the directory cannot be
 *         created or the output file cannot be opened
 */
@Override
public void open() throws TextDBException {
    if (cursor != CLOSED) {
        return;
    }

    inputOperator.open();
    inputSchema = inputOperator.getOutputSchema();

    // Keep only the attributes that are exported to the spreadsheet.
    Attribute[] exportedAttributes = inputSchema.getAttributes().stream()
            .filter(attribute -> {
                String name = attribute.getAttributeName();
                return !name.equalsIgnoreCase(SchemaConstants._ID)
                        && !name.equalsIgnoreCase(SchemaConstants.PAYLOAD)
                        && !attribute.getAttributeType().equals(AttributeType.LIST);
            })
            .toArray(Attribute[]::new);
    outputSchema = new Schema(exportedAttributes);

    wb = new XSSFWorkbook();

    // The file name is the current timestamp, e.g. 20170131-235959.xlsx.
    fileName = new SimpleDateFormat("yyyyMMdd-HHmmss").format(new Date()) + ".xlsx";
    try {
        if (Files.notExists(Paths.get(excelIndexDirectory))) {
            Files.createDirectories(Paths.get(excelIndexDirectory));
        }
        fileOut = new FileOutputStream(Paths.get(excelIndexDirectory, fileName).toString());
    } catch (IOException ioException) {
        throw new DataFlowException(ioException);
    }

    // Header row: one cell per output attribute, in schema order.
    sheet = wb.createSheet("new sheet");
    Row headerRow = sheet.createRow(0);
    int columnIndex = 0;
    for (String headerName : outputSchema.getAttributeNames()) {
        headerRow.createCell(columnIndex).setCellValue(headerName);
        columnIndex++;
    }

    cursor = OPENED;
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) DoubleField(edu.uci.ics.textdb.api.field.DoubleField) DateField(edu.uci.ics.textdb.api.field.DateField) Date(java.util.Date) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) SimpleDateFormat(java.text.SimpleDateFormat) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) IntegerField(edu.uci.ics.textdb.api.field.IntegerField) ISink(edu.uci.ics.textdb.api.dataflow.ISink) TextDBException(edu.uci.ics.textdb.api.exception.TextDBException) Cell(org.apache.poi.ss.usermodel.Cell) DateFormat(java.text.DateFormat) Tuple(edu.uci.ics.textdb.api.tuple.Tuple) IOperator(edu.uci.ics.textdb.api.dataflow.IOperator) Sheet(org.apache.poi.ss.usermodel.Sheet) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Files(java.nio.file.Files) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) Utils(edu.uci.ics.textdb.api.utils.Utils) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) Workbook(org.apache.poi.ss.usermodel.Workbook) Paths(java.nio.file.Paths) IField(edu.uci.ics.textdb.api.field.IField) Row(org.apache.poi.ss.usermodel.Row) Schema(edu.uci.ics.textdb.api.schema.Schema) IOException(java.io.IOException) Date(java.util.Date) SimpleDateFormat(java.text.SimpleDateFormat) DateFormat(java.text.DateFormat) FileOutputStream(java.io.FileOutputStream) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Row(org.apache.poi.ss.usermodel.Row) SimpleDateFormat(java.text.SimpleDateFormat)

Example 2 with SchemaConstants

use of edu.uci.ics.textdb.api.constants.SchemaConstants in project textdb by TextDB.

The method joinTuples of the class JoinDistancePredicate.

/**
 * Joins two tuples on the join attribute by pairing up their spans.
 *
 * This method is called by the Join operator. The two tuples must agree
 * (according to compareField) on the _ID field and on the join attribute.
 * An outer span and an inner span, both belonging to the join attribute, are
 * combined when their start offsets differ by at most the threshold and their
 * end offsets differ by at most the threshold. The combined span runs from the
 * smaller start to the larger end; its value is the corresponding substring of
 * innerTuple's join field and its key is "outerKey_innerKey".
 *
 * All non-span output fields are taken from innerTuple.
 *
 * @param innerTuple   tuple from the inner operator
 * @param outerTuple   tuple from the outer operator
 * @param outputSchema schema of the tuple to build
 * @return new Tuple containing the result of the join operation, or null when
 *         the _ID or join fields differ, when either tuple carries no usable
 *         span list, or when no pair of spans satisfies the threshold
 * @throws Exception declared by the signature; nothing is thrown explicitly
 *         by this method itself
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    /*
     * We expect the values of all fields to be the same for innerTuple and outerTuple.
     * Only the _ID field and the field to be joined are checked, since they are
     * crucial to the join operator. For other fields, the value from innerTuple is used.
     */
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }

    /*
     * If either tuple has no span information, return null so the caller can
     * move on to the next tuple. Previously a missing span field, or a field
     * that failed the class check below, led to a NullPointerException instead.
     */
    ListField<Span> spanFieldOfInnerTuple = innerTuple.getField(SchemaConstants.SPAN_LIST);
    ListField<Span> spanFieldOfOuterTuple = outerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> innerSpanList = null;
    List<Span> outerSpanList = null;
    if (spanFieldOfInnerTuple != null && spanFieldOfInnerTuple.getClass().equals(ListField.class)) {
        innerSpanList = spanFieldOfInnerTuple.getValue();
    }
    if (spanFieldOfOuterTuple != null && spanFieldOfOuterTuple.getClass().equals(ListField.class)) {
        outerSpanList = spanFieldOfOuterTuple.getValue();
    }
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }

    List<Span> newJoinSpanList = new ArrayList<>();
    // Loop-invariant, so it is read once instead of once per span pair.
    Integer threshold = this.getThreshold();
    for (Span outerSpan : outerSpanList) {
        // Only spans that belong to the join attribute take part in the join.
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            boolean startsClose = Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold;
            boolean endsClose = Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold;
            if (startsClose && endsClose) {
                // The joined span is the smallest range covering both spans.
                int newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                int newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                newJoinSpanList.add(new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue));
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }

    // Create output fields based on innerTuple's values. An explicit ArrayList is
    // collected because the span list is appended below and Collectors.toList()
    // does not guarantee a mutable result.
    List<IField> outputFields = outputSchema.getAttributes().stream()
            .filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE))
            .map(attr -> attr.getAttributeName())
            .map(attributeName -> innerTuple.getField(attributeName, IField.class))
            .collect(Collectors.toCollection(ArrayList::new));
    // NOTE(review): appending here assumes the span list attribute is the last
    // attribute of outputSchema — confirm against the schema builder.
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.toArray(new IField[0]));
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Iterator(java.util.Iterator) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) PredicateBase(edu.uci.ics.textdb.exp.common.PredicateBase) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) ListField(edu.uci.ics.textdb.api.field.ListField) IField(edu.uci.ics.textdb.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) edu.uci.ics.textdb.api.tuple(edu.uci.ics.textdb.api.tuple) Span(edu.uci.ics.textdb.api.span.Span) PropertyNameConstants(edu.uci.ics.textdb.exp.common.PropertyNameConstants) IOperator(edu.uci.ics.textdb.api.dataflow.IOperator) Attribute(edu.uci.ics.textdb.api.schema.Attribute) ArrayList(java.util.ArrayList) IField(edu.uci.ics.textdb.api.field.IField) Span(edu.uci.ics.textdb.api.span.Span)

Example 3 with SchemaConstants

use of edu.uci.ics.textdb.api.constants.SchemaConstants in project textdb by TextDB.

The method generateIntersectionSchema of the class JoinDistancePredicate.

/**
 * Builds the output schema as the intersection of the inner operator's schema
 * and the outer operator's schema.
 *
 * An inner attribute is kept only when the outer attribute list contains an
 * equal attribute (the attributes have to be exactly the same, name and type,
 * to be intersected). The order of the inner schema is preserved.
 *
 * The intersection must:
 * contain the attribute to be joined,
 * contain the "_ID" attribute,
 * contain the "spanList" attribute,
 * and the join attribute must be of type TEXT or STRING.
 *
 * @param innerOperatorSchema schema of the inner operator
 * @param outerOperatorSchema schema of the outer operator
 * @return outputSchema
 * @throws DataFlowException if the schemas share no attributes, if a required
 *         attribute is missing from the intersection, or if the join attribute
 *         is neither TEXT nor STRING
 */
private Schema generateIntersectionSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataFlowException {
    List<Attribute> innerAttributes = innerOperatorSchema.getAttributes();
    List<Attribute> outerAttributes = outerOperatorSchema.getAttributes();

    // Collect, in inner-schema order, every attribute that also appears in the outer schema.
    List<Attribute> sharedAttributes = new ArrayList<>();
    for (Attribute candidate : innerAttributes) {
        if (outerAttributes.contains(candidate)) {
            sharedAttributes.add(candidate);
        }
    }
    Schema intersectionSchema = new Schema(sharedAttributes.toArray(new Attribute[0]));

    // Verify that the output schema contains the necessary attributes.
    if (intersectionSchema.getAttributes().isEmpty()) {
        throw new DataFlowException("inner operator and outer operator don't share any common attributes");
    }
    Attribute joinAttribute = intersectionSchema.getAttribute(this.joinAttributeName);
    if (joinAttribute == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain join attribute");
    }
    if (intersectionSchema.getAttribute(SchemaConstants._ID) == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain _ID attribute");
    }
    if (intersectionSchema.getAttribute(SchemaConstants.SPAN_LIST) == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain spanList attribute");
    }

    // The join attribute has to be textual: TEXT or STRING.
    AttributeType joinAttrType = joinAttribute.getAttributeType();
    boolean isTextual = joinAttrType == AttributeType.TEXT || joinAttrType == AttributeType.STRING;
    if (!isTextual) {
        throw new DataFlowException(String.format("Join attribute %s must be either TEXT or STRING.", this.joinAttributeName));
    }
    return intersectionSchema;
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Iterator(java.util.Iterator) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) PredicateBase(edu.uci.ics.textdb.exp.common.PredicateBase) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) ListField(edu.uci.ics.textdb.api.field.ListField) IField(edu.uci.ics.textdb.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) edu.uci.ics.textdb.api.tuple(edu.uci.ics.textdb.api.tuple) Span(edu.uci.ics.textdb.api.span.Span) PropertyNameConstants(edu.uci.ics.textdb.exp.common.PropertyNameConstants) IOperator(edu.uci.ics.textdb.api.dataflow.IOperator) Attribute(edu.uci.ics.textdb.api.schema.Attribute) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) Schema(edu.uci.ics.textdb.api.schema.Schema) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException)

Aggregations

SchemaConstants (edu.uci.ics.textdb.api.constants.SchemaConstants)3 IOperator (edu.uci.ics.textdb.api.dataflow.IOperator)3 DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException)3 IField (edu.uci.ics.textdb.api.field.IField)3 Attribute (edu.uci.ics.textdb.api.schema.Attribute)3 AttributeType (edu.uci.ics.textdb.api.schema.AttributeType)3 Schema (edu.uci.ics.textdb.api.schema.Schema)3 ArrayList (java.util.ArrayList)3 List (java.util.List)3 JsonCreator (com.fasterxml.jackson.annotation.JsonCreator)2 JsonProperty (com.fasterxml.jackson.annotation.JsonProperty)2 ListField (edu.uci.ics.textdb.api.field.ListField)2 Span (edu.uci.ics.textdb.api.span.Span)2 edu.uci.ics.textdb.api.tuple (edu.uci.ics.textdb.api.tuple)2 PredicateBase (edu.uci.ics.textdb.exp.common.PredicateBase)2 PropertyNameConstants (edu.uci.ics.textdb.exp.common.PropertyNameConstants)2 Iterator (java.util.Iterator)2 Collectors (java.util.stream.Collectors)2 ISink (edu.uci.ics.textdb.api.dataflow.ISink)1 TextDBException (edu.uci.ics.textdb.api.exception.TextDBException)1