Search in sources :

Example 1 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class JoinDistancePredicate method generateIntersectionSchema.

/**
 * Builds the join's output schema as the set of attributes shared by both input schemas.
 *
 * An attribute is kept only if the outer schema's attribute list contains an
 * attribute equal to it (per {@code List.contains}); the order follows the inner schema.
 *
 * The resulting schema must contain:
 * the join attribute,
 * the "_ID" attribute,
 * the "spanList" attribute,
 * and the join attribute must be of type TEXT or STRING.
 *
 * @param innerOperatorSchema schema produced by the inner operator
 * @param outerOperatorSchema schema produced by the outer operator
 * @return the intersection schema
 * @throws DataflowException if any of the requirements above is not met
 */
private Schema generateIntersectionSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataflowException {
    List<Attribute> fromInner = innerOperatorSchema.getAttributes();
    List<Attribute> fromOuter = outerOperatorSchema.getAttributes();

    // keep the inner attributes that also appear in the outer schema
    List<Attribute> shared = new ArrayList<>();
    for (Attribute candidate : fromInner) {
        if (fromOuter.contains(candidate)) {
            shared.add(candidate);
        }
    }
    Schema result = new Schema(shared.toArray(new Attribute[0]));

    // validate that the mandatory attributes survived the intersection
    if (result.getAttributes().isEmpty()) {
        throw new DataflowException("inner operator and outer operator don't share any common attributes");
    }
    if (!result.containsAttribute(this.joinAttributeName)) {
        throw new DataflowException("inner operator or outer operator doesn't contain join attribute");
    }
    if (!result.containsAttribute(SchemaConstants._ID)) {
        throw new DataflowException("inner operator or outer operator doesn't contain _ID attribute");
    }
    if (!result.containsAttribute(SchemaConstants.SPAN_LIST)) {
        throw new DataflowException("inner operator or outer operator doesn't contain spanList attribute");
    }

    // the join only works on textual attributes
    AttributeType typeOfJoinAttr = result.getAttribute(this.joinAttributeName).getType();
    boolean textual = typeOfJoinAttr == AttributeType.TEXT || typeOfJoinAttr == AttributeType.STRING;
    if (!textual) {
        throw new DataflowException(String.format("Join attribute %s must be either TEXT or STRING.", this.joinAttributeName));
    }
    return result;
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) Iterator(java.util.Iterator) ImmutableMap(com.google.common.collect.ImmutableMap) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) OperatorGroupConstants(edu.uci.ics.texera.dataflow.common.OperatorGroupConstants) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IField(edu.uci.ics.texera.api.field.IField) Map(java.util.Map) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Example 2 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class JoinDistancePredicate method joinTuples.

/**
 * This method is called by the Join operator to perform the join on the
 * tuples passed.
 *
 * Two spans (one from each tuple, both on the join attribute) are joined when
 * both their start offsets and their end offsets differ by at most the
 * threshold. The joined span covers the union of the two spans.
 *
 * @param innerTuple tuple from the inner operator
 * @param outerTuple tuple from the outer operator
 * @param outputSchema schema of the tuple to produce
 * @return New Tuple containing the result of join operation, or null when the
 *         tuples do not match, carry no usable span list, or yield no joined span.
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    List<Span> newJoinSpanList = new ArrayList<>();
    /*
     * We expect the values of all fields to be the same for innerTuple and outerTuple.
     * We only check the _ID field and the field to be joined, since they are crucial to the join operator.
     * For other fields, we use the value from innerTuple.
     */
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    // check if the fields to be joined are the same
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }
    /*
     * If either/both tuples have no span information, return null
     * so the caller can process the next tuple.
     */
    ListField<Span> spanFieldOfInnerTuple = innerTuple.getField(SchemaConstants.SPAN_LIST);
    ListField<Span> spanFieldOfOuterTuple = outerTuple.getField(SchemaConstants.SPAN_LIST);
    if (spanFieldOfInnerTuple == null || spanFieldOfOuterTuple == null) {
        return null;
    }
    // only a plain ListField is accepted as a span list
    List<Span> innerSpanList = ListField.class.equals(spanFieldOfInnerTuple.getClass()) ? spanFieldOfInnerTuple.getValue() : null;
    List<Span> outerSpanList = ListField.class.equals(spanFieldOfOuterTuple.getClass()) ? spanFieldOfOuterTuple.getValue() : null;
    // previously a missing span list fell through and caused a NullPointerException on iterator()
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }
    for (Span outerSpan : outerSpanList) {
        // skip spans that were produced on a different attribute
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            int threshold = this.getThreshold();
            if (Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold && Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold) {
                // the joined span is the smallest span covering both inputs
                int newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                int newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                Span newSpan = new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue);
                newJoinSpanList.add(newSpan);
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }
    // create output fields based on innerTuple's value; the joined span list is appended last
    // NOTE(review): this assumes the span list attribute is the last attribute of outputSchema - confirm
    List<Attribute> outputAttrList = outputSchema.getAttributes();
    List<IField> outputFields = outputAttrList.stream().filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE)).map(attr -> attr.getName()).map(attributeName -> innerTuple.getField(attributeName, IField.class)).collect(Collectors.toList());
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.stream().toArray(IField[]::new));
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) Iterator(java.util.Iterator) ImmutableMap(com.google.common.collect.ImmutableMap) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) OperatorGroupConstants(edu.uci.ics.texera.dataflow.common.OperatorGroupConstants) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IField(edu.uci.ics.texera.api.field.IField) Map(java.util.Map) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Attribute(edu.uci.ics.texera.api.schema.Attribute) ArrayList(java.util.ArrayList) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 3 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class FileSourceOperator method open.

/**
 * Opens the operator and derives the output schema from the first line of the first file.
 *
 * Column names are chosen as follows:
 * - file format CSV_WITH_HEADER: the cells of the first line (the whole line if no
 *   column delimiter is configured);
 * - otherwise, if a column delimiter is configured: "c1", "c2", ... one per cell of the first line;
 * - otherwise, or if the file has no first line: the single column "c1".
 *
 * Every column is typed TEXT and the _ID attribute is prepended. Calling this
 * method when the operator is not closed is a no-op.
 *
 * @throws TexeraException (as DataflowException) if the file cannot be read
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }
    try {
        String delimiter = predicate.getColumnDelimiter();
        boolean hasHeader = predicate.getFileFormat() == FileSourcePredicate.FileFormat.CSV_WITH_HEADER;
        List<String> columnNames = null;
        if (hasHeader || delimiter != null) {
            Optional<String> firstLine;
            // Files.lines holds an open file handle until the stream is closed
            try (java.util.stream.Stream<String> lines = Files.lines(pathList.get(0))) {
                firstLine = lines.findFirst();
            }
            if (firstLine.isPresent()) {
                // without a delimiter the whole line is a single cell
                String[] cells = delimiter == null ? new String[] { firstLine.get() } : firstLine.get().split(delimiter);
                if (hasHeader) {
                    // header names win; they must not be replaced by generated names
                    columnNames = Arrays.stream(cells).collect(Collectors.toList());
                } else {
                    columnNames = IntStream.rangeClosed(1, cells.length).mapToObj(i -> "c" + i).collect(Collectors.toList());
                }
            }
        }
        if (columnNames == null) {
            columnNames = Collections.singletonList("c1");
        }
        List<Attribute> attributes = columnNames.stream().map(name -> new Attribute(name, AttributeType.TEXT)).collect(Collectors.toList());
        this.outputSchema = new Schema.Builder().add(SchemaConstants._ID_ATTRIBUTE).add(attributes).build();
        // mark as opened only once the schema has been built, so a failed open can be retried
        cursor = OPENED;
    } catch (IOException e) {
        throw new DataflowException(e);
    }
}
Also used : IntStream(java.util.stream.IntStream) java.util(java.util) Verify(com.google.common.base.Verify) Files(java.nio.file.Files) Tuple(edu.uci.ics.texera.api.tuple.Tuple) QueryContext(edu.uci.ics.texera.dataflow.plangen.QueryContext) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) FileManager(edu.uci.ics.texera.dataflow.resource.file.FileManager) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IField(edu.uci.ics.texera.api.field.IField) TextField(edu.uci.ics.texera.api.field.TextField) Paths(java.nio.file.Paths) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) ISourceOperator(edu.uci.ics.texera.api.dataflow.ISourceOperator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) IDField(edu.uci.ics.texera.api.field.IDField) Path(java.nio.file.Path) Attribute(edu.uci.ics.texera.api.schema.Attribute) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) IOException(java.io.IOException)

Example 4 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class CSVSink method open.

/**
 * Opens the input operator, derives the output schema and creates the CSV file
 * with its header row.
 *
 * The output schema is the input schema without the _ID attribute, the payload
 * attribute and any attribute of type LIST. The file is named after the current
 * time ("yyyyMMdd-HHmmss" + ".csv") inside csvIndexDirectory, which is created
 * if it does not exist. Calling this method when the sink is not closed is a no-op.
 *
 * @throws TexeraException (as DataflowException) if the directory or file cannot be created
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }
    inputOperator.open();
    inputSchema = inputOperator.getOutputSchema();
    outputSchema = new Schema(inputSchema.getAttributes().stream().filter(attr -> !attr.getName().equalsIgnoreCase(SchemaConstants._ID)).filter(attr -> !attr.getName().equalsIgnoreCase(SchemaConstants.PAYLOAD)).filter(attr -> !attr.getType().equals(AttributeType.LIST)).toArray(Attribute[]::new));
    DateFormat df = new SimpleDateFormat("yyyyMMdd-HHmmss");
    fileName = df.format(new Date()) + ".csv";
    try {
        if (Files.notExists(csvIndexDirectory)) {
            Files.createDirectories(csvIndexDirectory);
        }
        // Files.newBufferedWriter always encodes as UTF-8; FileWriter used the
        // platform default charset, which made the output machine-dependent
        csvWriter = new CSVWriter(Files.newBufferedWriter(csvIndexDirectory.resolve(fileName)));
    } catch (IOException e) {
        throw new DataflowException(e);
    }
    // write csv headers
    List<String> attributeNames = outputSchema.getAttributeNames();
    csvWriter.writeNext(attributeNames.stream().toArray(String[]::new));
    cursor = OPENED;
}
Also used : Files(java.nio.file.Files) Date(java.util.Date) CSVWriter(au.com.bytecode.opencsv.CSVWriter) Tuple(edu.uci.ics.texera.api.tuple.Tuple) FileWriter(java.io.FileWriter) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) SimpleDateFormat(java.text.SimpleDateFormat) IOException(java.io.IOException) File(java.io.File) ArrayList(java.util.ArrayList) List(java.util.List) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) Utils(edu.uci.ics.texera.api.utils.Utils) ISink(edu.uci.ics.texera.api.dataflow.ISink) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Path(java.nio.file.Path) DateFormat(java.text.DateFormat) Schema(edu.uci.ics.texera.api.schema.Schema) SimpleDateFormat(java.text.SimpleDateFormat) DateFormat(java.text.DateFormat) FileWriter(java.io.FileWriter) CSVWriter(au.com.bytecode.opencsv.CSVWriter) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) IOException(java.io.IOException) SimpleDateFormat(java.text.SimpleDateFormat) File(java.io.File) Date(java.util.Date)

Example 5 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class ExcelSink method open.

/**
 * Opens the input operator, derives the output schema and prepares the workbook
 * with a header row.
 *
 * The output schema is the input schema without the _ID attribute, the payload
 * attribute and any attribute of type LIST. The target file is named after the
 * current time ("yyyyMMdd-HHmmss" + ".xlsx") inside excelIndexDirectory, which is
 * created if it does not exist. Calling this method when the sink is not closed
 * is a no-op.
 *
 * @throws TexeraException (as DataflowException) if the directory or file cannot be created
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }
    inputOperator.open();
    inputSchema = inputOperator.getOutputSchema();

    // drop internal attributes (_ID, payload) and list-typed attributes
    List<Attribute> exportable = new ArrayList<>();
    for (Attribute candidate : inputSchema.getAttributes()) {
        boolean isId = candidate.getName().equalsIgnoreCase(SchemaConstants._ID);
        if (isId) {
            continue;
        }
        boolean isPayload = candidate.getName().equalsIgnoreCase(SchemaConstants.PAYLOAD);
        if (isPayload) {
            continue;
        }
        if (candidate.getType().equals(AttributeType.LIST)) {
            continue;
        }
        exportable.add(candidate);
    }
    outputSchema = new Schema(exportable.toArray(new Attribute[0]));

    wb = new XSSFWorkbook();
    DateFormat timestampFormat = new SimpleDateFormat("yyyyMMdd-HHmmss");
    fileName = timestampFormat.format(new Date()) + ".xlsx";
    try {
        boolean directoryMissing = Files.notExists(excelIndexDirectory);
        if (directoryMissing) {
            Files.createDirectories(excelIndexDirectory);
        }
        String targetPath = excelIndexDirectory.resolve(fileName).toString();
        fileOut = new FileOutputStream(targetPath);
    } catch (IOException ioFailure) {
        throw new DataflowException(ioFailure);
    }

    // first row of the sheet holds the attribute names
    sheet = wb.createSheet("new sheet");
    Row headerRow = sheet.createRow(0);
    int column = 0;
    for (String title : outputSchema.getAttributeNames()) {
        headerRow.createCell(column).setCellValue(title);
        column++;
    }
    cursor = OPENED;
}
Also used : Date(java.util.Date) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) SimpleDateFormat(java.text.SimpleDateFormat) ArrayList(java.util.ArrayList) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) IField(edu.uci.ics.texera.api.field.IField) ISink(edu.uci.ics.texera.api.dataflow.ISink) Cell(org.apache.poi.ss.usermodel.Cell) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Path(java.nio.file.Path) DateFormat(java.text.DateFormat) IntegerField(edu.uci.ics.texera.api.field.IntegerField) Sheet(org.apache.poi.ss.usermodel.Sheet) Files(java.nio.file.Files) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) DoubleField(edu.uci.ics.texera.api.field.DoubleField) DateField(edu.uci.ics.texera.api.field.DateField) List(java.util.List) Workbook(org.apache.poi.ss.usermodel.Workbook) Utils(edu.uci.ics.texera.api.utils.Utils) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Row(org.apache.poi.ss.usermodel.Row) Schema(edu.uci.ics.texera.api.schema.Schema) IOException(java.io.IOException) Date(java.util.Date) SimpleDateFormat(java.text.SimpleDateFormat) DateFormat(java.text.DateFormat) FileOutputStream(java.io.FileOutputStream) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Row(org.apache.poi.ss.usermodel.Row) SimpleDateFormat(java.text.SimpleDateFormat)

Aggregations

SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants)7 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)7 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)7 Attribute (edu.uci.ics.texera.api.schema.Attribute)7 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)7 Schema (edu.uci.ics.texera.api.schema.Schema)7 IField (edu.uci.ics.texera.api.field.IField)6 ArrayList (java.util.ArrayList)6 List (java.util.List)6 ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages)5 Tuple (edu.uci.ics.texera.api.tuple.Tuple)5 IOperator (edu.uci.ics.texera.api.dataflow.IOperator)4 ISink (edu.uci.ics.texera.api.dataflow.ISink)4 IOException (java.io.IOException)4 Files (java.nio.file.Files)4 Path (java.nio.file.Path)4 Collectors (java.util.stream.Collectors)4 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)3 DateField (edu.uci.ics.texera.api.field.DateField)3 DoubleField (edu.uci.ics.texera.api.field.DoubleField)3