Search in sources :

Example 6 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class JoinDistancePredicate method joinTuples.

/**
 * Joins two tuples on the span distance of the join attribute.
 *
 * <p>The tuples are joined only if they agree on the {@code _ID} field and on
 * the join attribute, and if at least one pair of spans on the join attribute
 * (one from each tuple) has both its start offsets and its end offsets within
 * the threshold of each other. Every such pair produces one combined span that
 * covers both originals.
 *
 * @param innerTuple   tuple from the inner operator; non-span output fields are copied from it
 * @param outerTuple   tuple from the outer operator
 * @param outputSchema schema of the tuple to build
 * @return a new tuple holding the inner tuple's fields plus the list of combined
 *         spans, or {@code null} if the tuples do not join (mismatching fields,
 *         missing span information, or no span pair within the threshold)
 * @throws Exception declared by the join contract; propagated from the helpers called here
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    /*
     * Only the _ID field and the join attribute are compared, since they are the
     * ones the join depends on. All other fields are taken from innerTuple.
     */
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }

    // Without span information on both sides there is nothing to join on;
    // return null so the caller can move on to the next tuple.
    List<Span> innerSpanList = spanListOf(innerTuple);
    List<Span> outerSpanList = spanListOf(outerTuple);
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }

    List<Span> newJoinSpanList = new ArrayList<>();
    for (Span outerSpan : outerSpanList) {
        // Only spans on the join attribute take part in the join.
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            Integer threshold = this.getThreshold();
            if (Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold && Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold) {
                // The combined span covers both spans: earliest start to latest end.
                Integer newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                Integer newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                Span newSpan = new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue);
                newJoinSpanList.add(newSpan);
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }

    // Output fields: every non-span-list attribute of the output schema, read
    // from innerTuple, followed by the newly built span list.
    List<Attribute> outputAttrList = outputSchema.getAttributes();
    List<IField> outputFields = outputAttrList.stream().filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE)).map(attr -> attr.getName()).map(attributeName -> innerTuple.getField(attributeName, IField.class)).collect(Collectors.toList());
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.stream().toArray(IField[]::new));
}

/**
 * Reads the span list of a tuple.
 *
 * @return the spans stored in the tuple's span-list field, or {@code null} if the
 *         field is absent or is not exactly a {@link ListField}
 */
private static List<Span> spanListOf(Tuple tuple) {
    ListField<Span> spanField = tuple.getField(SchemaConstants.SPAN_LIST);
    if (spanField == null || !spanField.getClass().equals(ListField.class)) {
        return null;
    }
    return spanField.getValue();
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) Iterator(java.util.Iterator) ImmutableMap(com.google.common.collect.ImmutableMap) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) OperatorGroupConstants(edu.uci.ics.texera.dataflow.common.OperatorGroupConstants) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IField(edu.uci.ics.texera.api.field.IField) Map(java.util.Map) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Attribute(edu.uci.ics.texera.api.schema.Attribute) ArrayList(java.util.ArrayList) IField(edu.uci.ics.texera.api.field.IField) Span(edu.uci.ics.texera.api.span.Span)

Example 7 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class FileSourceOperator method open.

/**
 * Opens the source operator and derives its output schema from the first file.
 *
 * <p>Column names are chosen as follows:
 * <ol>
 *   <li>for {@code CSV_WITH_HEADER} files, the delimiter-separated values of the first line;</li>
 *   <li>otherwise, if a column delimiter is configured, generated names
 *       {@code c1..cn}, where n is the number of columns in the first line;</li>
 *   <li>otherwise a single column named {@code c1}.</li>
 * </ol>
 * The schema consists of the {@code _id} attribute followed by one TEXT attribute per column.
 * Calling this method when the operator is not closed is a no-op.
 *
 * @throws TexeraException (as DataflowException) if the first file cannot be read
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }
    try {
        List<String> columnNames = null;
        if (predicate.getFileFormat() == FileSourcePredicate.FileFormat.CSV_WITH_HEADER) {
            Optional<String> header = readFirstLine(pathList.get(0));
            if (header.isPresent()) {
                columnNames = Arrays.stream(header.get().split(predicate.getColumnDelimiter())).collect(Collectors.toList());
            }
        }
        // Generate names only when the header did not provide them; otherwise
        // the header-derived names would be discarded.
        if (columnNames == null && predicate.getColumnDelimiter() != null) {
            Optional<String> firstLine = readFirstLine(pathList.get(0));
            if (firstLine.isPresent()) {
                int columnCount = firstLine.get().split(predicate.getColumnDelimiter()).length;
                columnNames = IntStream.rangeClosed(1, columnCount).mapToObj(i -> "c" + i).collect(Collectors.toList());
            }
        }
        if (columnNames == null) {
            columnNames = Collections.singletonList("c1");
        }
        List<Attribute> attributes = columnNames.stream().map(name -> new Attribute(name, AttributeType.TEXT)).collect(Collectors.toList());
        this.outputSchema = new Schema.Builder().add(SchemaConstants._ID_ATTRIBUTE).add(attributes).build();
    } catch (IOException e) {
        throw new DataflowException(e);
    }
    // Mark the operator as opened only once the schema has been built, so a
    // failed open can be retried.
    cursor = OPENED;
}

/**
 * Returns the first line of the given file, closing the underlying file handle
 * before returning.
 */
private static Optional<String> readFirstLine(Path path) throws IOException {
    try (java.util.stream.Stream<String> lines = Files.lines(path)) {
        return lines.findFirst();
    }
}
Also used : IntStream(java.util.stream.IntStream) java.util(java.util) Verify(com.google.common.base.Verify) Files(java.nio.file.Files) Tuple(edu.uci.ics.texera.api.tuple.Tuple) QueryContext(edu.uci.ics.texera.dataflow.plangen.QueryContext) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) FileManager(edu.uci.ics.texera.dataflow.resource.file.FileManager) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IField(edu.uci.ics.texera.api.field.IField) TextField(edu.uci.ics.texera.api.field.TextField) Paths(java.nio.file.Paths) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) ISourceOperator(edu.uci.ics.texera.api.dataflow.ISourceOperator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) IDField(edu.uci.ics.texera.api.field.IDField) Path(java.nio.file.Path) Attribute(edu.uci.ics.texera.api.schema.Attribute) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) IOException(java.io.IOException)

Example 8 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class JSONSink method open.

/**
 * Opens the sink: opens the input operator, derives the output schema (the
 * input schema without the {@code _id} and payload attributes), and starts a
 * JSON array in a new timestamp-named file inside the JSON index directory.
 * Does nothing unless the sink is currently closed.
 *
 * @throws TexeraException (as DataflowException) if the directory or the output file cannot be created
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }

    inputOperator.open();
    inputSchema = inputOperator.getOutputSchema();

    // Keep every attribute except _id and payload (names compared case-insensitively).
    Attribute[] keptAttributes = inputSchema.getAttributes().stream()
            .filter(attr -> !(attr.getName().equalsIgnoreCase(SchemaConstants._ID)
                    || attr.getName().equalsIgnoreCase(SchemaConstants.PAYLOAD)))
            .toArray(Attribute[]::new);
    outputSchema = new Schema(keptAttributes);

    // The output file is named after the time the sink was opened.
    DateFormat timestampFormat = new SimpleDateFormat("yyyyMMdd-HHmmss");
    String timestamp = timestampFormat.format(new Date());
    fileName = timestamp + ".json";
    mapper = new ObjectMapper();

    File outputFile = new File(jsonIndexDirectory.resolve(fileName).toString());
    try {
        if (Files.notExists(jsonIndexDirectory)) {
            Files.createDirectories(jsonIndexDirectory);
        }
        // The generator writes UTF-8 JSON straight to the file; the array opened
        // here is left open when this method returns.
        jsonGenerator = mapper.getFactory().createGenerator(outputFile, JsonEncoding.UTF8);
        jsonGenerator.writeStartArray();
    } catch (IOException ioFailure) {
        throw new DataflowException(ioFailure);
    }

    cursor = OPENED;
}
Also used : ListField(edu.uci.ics.texera.api.field.ListField) Date(java.util.Date) JsonGenerator(com.fasterxml.jackson.core.JsonGenerator) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) SimpleDateFormat(java.text.SimpleDateFormat) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) IField(edu.uci.ics.texera.api.field.IField) JsonEncoding(com.fasterxml.jackson.core.JsonEncoding) ISink(edu.uci.ics.texera.api.dataflow.ISink) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Path(java.nio.file.Path) DateFormat(java.text.DateFormat) IntegerField(edu.uci.ics.texera.api.field.IntegerField) Files(java.nio.file.Files) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) IOException(java.io.IOException) File(java.io.File) DoubleField(edu.uci.ics.texera.api.field.DoubleField) DateField(edu.uci.ics.texera.api.field.DateField) List(java.util.List) Utils(edu.uci.ics.texera.api.utils.Utils) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) SimpleDateFormat(java.text.SimpleDateFormat) DateFormat(java.text.DateFormat) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) IOException(java.io.IOException) SimpleDateFormat(java.text.SimpleDateFormat) File(java.io.File) Date(java.util.Date) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)

Example 9 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class MysqlSink method open.

/**
 * Opens the sink: opens the input operator, derives the output schema (the
 * input schema without the {@code _id}, payload and list attributes), sets up
 * the JDBC connection, then drops the previous table and creates a new one
 * based on the output schema. Does nothing if the sink is already opened.
 *
 * <p>If any JDBC step fails, the connection and statement opened by this call
 * are closed before the exception is thrown, so a failed open does not leak
 * database resources.
 *
 * @throws TexeraException (as DataflowException) if the driver cannot be loaded or any JDBC step fails
 */
@Override
public void open() throws TexeraException {
    if (cursor == OPENED) {
        return;
    }
    inputOperator.open();
    Schema inputSchema = inputOperator.getOutputSchema();
    outputSchema = new Schema(inputSchema.getAttributes().stream().filter(attr -> !attr.getName().equalsIgnoreCase(SchemaConstants._ID)).filter(attr -> !attr.getName().equalsIgnoreCase(SchemaConstants.PAYLOAD)).filter(attr -> !attr.getType().equals(AttributeType.LIST)).toArray(Attribute[]::new));
    // Track what this call opened so that only those resources are released on failure.
    Connection openedConnection = null;
    Statement openedStatement = null;
    try {
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        String url = "jdbc:mysql://" + predicate.getHost() + ":" + predicate.getPort() + "/" + predicate.getDatabase() + "?autoReconnect=true&useSSL=true";
        openedConnection = DriverManager.getConnection(url, predicate.getUsername(), predicate.getPassword());
        this.connection = openedConnection;
        openedStatement = openedConnection.createStatement();
        statement = openedStatement;
        mysqlDropTable();
        mysqlCreateTable();
        cursor = OPENED;
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        DataflowException failure = new DataflowException("MysqlSink failed to connect to mysql database." + e.getMessage());
        closeAfterFailedOpen(openedStatement, failure);
        closeAfterFailedOpen(openedConnection, failure);
        throw failure;
    }
}

/**
 * Closes a resource opened during a failed {@code open()}; an error raised while
 * closing is attached to {@code failure} as a suppressed exception instead of
 * replacing it.
 */
private static void closeAfterFailedOpen(AutoCloseable resource, Throwable failure) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (Exception closeError) {
        failure.addSuppressed(closeError);
    }
}
Also used : Connection(java.sql.Connection) Tuple(edu.uci.ics.texera.api.tuple.Tuple) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PreparedStatement(java.sql.PreparedStatement) Collectors(java.util.stream.Collectors) DoubleField(edu.uci.ics.texera.api.field.DoubleField) ArrayList(java.util.ArrayList) DateField(edu.uci.ics.texera.api.field.DateField) SQLException(java.sql.SQLException) List(java.util.List) Stream(java.util.stream.Stream) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) IField(edu.uci.ics.texera.api.field.IField) ISink(edu.uci.ics.texera.api.dataflow.ISink) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) Statement(java.sql.Statement) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) DriverManager(java.sql.DriverManager) IntegerField(edu.uci.ics.texera.api.field.IntegerField) SQLException(java.sql.SQLException) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Example 10 with SchemaConstants

use of edu.uci.ics.texera.api.constants.SchemaConstants in project textdb by TextDB.

the class CSVSink method open.

/**
 * Opens the sink: opens the input operator, derives the output schema (the
 * input schema without the {@code _id}, payload and list attributes), creates
 * a timestamp-named CSV file inside the CSV index directory and writes the
 * header row. Does nothing unless the sink is currently closed.
 *
 * @throws TexeraException (as DataflowException) if the directory or the output file cannot be created
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return;
    }

    inputOperator.open();
    inputSchema = inputOperator.getOutputSchema();

    // Drop _id and payload (names compared case-insensitively) and every LIST-typed attribute.
    Attribute[] keptAttributes = inputSchema.getAttributes().stream()
            .filter(attr -> !(attr.getName().equalsIgnoreCase(SchemaConstants._ID)
                    || attr.getName().equalsIgnoreCase(SchemaConstants.PAYLOAD)
                    || attr.getType().equals(AttributeType.LIST)))
            .toArray(Attribute[]::new);
    outputSchema = new Schema(keptAttributes);

    // The output file is named after the time the sink was opened.
    DateFormat timestampFormat = new SimpleDateFormat("yyyyMMdd-HHmmss");
    String timestamp = timestampFormat.format(new Date());
    fileName = timestamp + ".csv";

    File outputFile = new File(csvIndexDirectory.resolve(fileName).toString());
    try {
        if (Files.notExists(csvIndexDirectory)) {
            Files.createDirectories(csvIndexDirectory);
        }
        csvWriter = new CSVWriter(new FileWriter(outputFile));
    } catch (IOException ioFailure) {
        throw new DataflowException(ioFailure);
    }

    // First row of the file: the output attribute names.
    List<String> headerNames = outputSchema.getAttributeNames();
    String[] headerRow = headerNames.toArray(new String[0]);
    csvWriter.writeNext(headerRow);

    cursor = OPENED;
}
Also used : Files(java.nio.file.Files) Date(java.util.Date) CSVWriter(au.com.bytecode.opencsv.CSVWriter) Tuple(edu.uci.ics.texera.api.tuple.Tuple) FileWriter(java.io.FileWriter) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) SimpleDateFormat(java.text.SimpleDateFormat) IOException(java.io.IOException) File(java.io.File) ArrayList(java.util.ArrayList) List(java.util.List) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) Utils(edu.uci.ics.texera.api.utils.Utils) ISink(edu.uci.ics.texera.api.dataflow.ISink) ErrorMessages(edu.uci.ics.texera.api.constants.ErrorMessages) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Path(java.nio.file.Path) DateFormat(java.text.DateFormat) Schema(edu.uci.ics.texera.api.schema.Schema) SimpleDateFormat(java.text.SimpleDateFormat) DateFormat(java.text.DateFormat) FileWriter(java.io.FileWriter) CSVWriter(au.com.bytecode.opencsv.CSVWriter) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) IOException(java.io.IOException) SimpleDateFormat(java.text.SimpleDateFormat) File(java.io.File) Date(java.util.Date)

Aggregations

SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants)14 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)14 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)14 Attribute (edu.uci.ics.texera.api.schema.Attribute)14 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)14 Schema (edu.uci.ics.texera.api.schema.Schema)14 IField (edu.uci.ics.texera.api.field.IField)12 ArrayList (java.util.ArrayList)12 List (java.util.List)12 ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages)10 Tuple (edu.uci.ics.texera.api.tuple.Tuple)10 IOperator (edu.uci.ics.texera.api.dataflow.IOperator)8 ISink (edu.uci.ics.texera.api.dataflow.ISink)8 IOException (java.io.IOException)8 Files (java.nio.file.Files)8 Path (java.nio.file.Path)8 Collectors (java.util.stream.Collectors)8 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)6 DateField (edu.uci.ics.texera.api.field.DateField)6 DoubleField (edu.uci.ics.texera.api.field.DoubleField)6