Search in sources :

Example 1 with DataSetRowIterator

use of org.talend.dataprep.api.dataset.json.DataSetRowIterator in project data-prep by Talend.

The method stream of the class DataSetContentStore.

/**
 * Similarly to {@link #get(DataSetMetadata)} returns the content of the data set but as a {@link Stream stream} of
 * {@link DataSetRow rows} instead of JSON content.
 * <p>
 * The returned stream is sequential. Empty rows are skipped; every remaining row is fed to a quality analyzer built
 * from the data set columns, passed through an {@link InvalidMarker}, and then given a TDP id taken from a counter
 * that starts at 1 (ids are assigned after the empty-row filter, so skipped rows do not consume an id).
 * <p>
 * The caller <b>must</b> close the returned stream: doing so closes the underlying raw content. If building the
 * stream fails, the raw content is closed before the failure is propagated.
 *
 * @param dataSetMetadata The {@link DataSetMetadata data set} to read rows from.
 * @param limit A limit to pass to raw content supplier (use -1 for "no limit"). Used as parameter to call
 * {@link #get(DataSetMetadata, long)}.
 * @return A valid <b>{@link DataSetRow}</b> stream.
 * @throws TDPException when closing the returned stream and the underlying content cannot be closed.
 */
public Stream<DataSetRow> stream(DataSetMetadata dataSetMetadata, long limit) {
    final InputStream inputStream = get(dataSetMetadata, limit);
    try {
        final DataSetRowIterator iterator = new DataSetRowIterator(inputStream);
        final Iterable<DataSetRow> rowIterable = () -> iterator;

        final AtomicLong tdpId = new AtomicLong(1);
        final List<ColumnMetadata> columns = dataSetMetadata.getRowMetadata().getColumns();
        final Analyzer<Analyzers.Result> analyzer = service.build(columns, AnalyzerService.Analysis.QUALITY);

        return StreamSupport.stream(rowIterable.spliterator(), false) //
                .filter(r -> !r.isEmpty()) //
                .map(r -> {
                    // Feed the analyzer with the row values, ordered like the data set columns.
                    final String[] values = r.order(columns).toArray(DataSetRow.SKIP_TDP_ID);
                    analyzer.analyze(values);
                    return r;
                }) //
                // Mark invalid columns as detected by provided analyzer.
                .map(new InvalidMarker(columns, analyzer)) //
                .map(r -> {
                    r.setTdpId(tdpId.getAndIncrement());
                    return r;
                }) //
                // Make sure to close the original input stream when closing this one.
                .onClose(() -> {
                    try {
                        inputStream.close();
                    } catch (Exception e) {
                        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
                    }
                });
    } catch (RuntimeException e) {
        // The caller never receives a stream it could close, so release the raw content here to avoid a leak.
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (Exception closeFailure) {
                // Keep the original failure as the primary error.
                e.addSuppressed(closeFailure);
            }
        }
        throw e;
    }
}
Also used : Analyzers(org.talend.dataquality.common.inference.Analyzers) DataSetRowIterator(org.talend.dataprep.api.dataset.json.DataSetRowIterator) TDPException(org.talend.dataprep.exception.TDPException) FormatFamilyFactory(org.talend.dataprep.schema.FormatFamilyFactory) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Autowired(org.springframework.beans.factory.annotation.Autowired) DataSetContent(org.talend.dataprep.api.dataset.DataSetContent) Value(org.springframework.beans.factory.annotation.Value) AnalyzerService(org.talend.dataprep.quality.AnalyzerService) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) Stream(java.util.stream.Stream) InvalidMarker(org.talend.dataprep.api.dataset.row.InvalidMarker) Serializer(org.talend.dataprep.schema.Serializer) DataSetRow(org.talend.dataprep.api.dataset.row.DataSetRow) CommonErrorCodes(org.talend.dataprep.exception.error.CommonErrorCodes) Analyzer(org.talend.dataquality.common.inference.Analyzer) StreamSupport(java.util.stream.StreamSupport) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) InputStream(java.io.InputStream) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) DataSetRowIterator(org.talend.dataprep.api.dataset.json.DataSetRowIterator) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) InputStream(java.io.InputStream) TDPException(org.talend.dataprep.exception.TDPException) TDPException(org.talend.dataprep.exception.TDPException) AtomicLong(java.util.concurrent.atomic.AtomicLong) InvalidMarker(org.talend.dataprep.api.dataset.row.InvalidMarker) DataSetRow(org.talend.dataprep.api.dataset.row.DataSetRow)

Aggregations

ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)1 InputStream (java.io.InputStream)1 List (java.util.List)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 Stream (java.util.stream.Stream)1 StreamSupport (java.util.stream.StreamSupport)1 Autowired (org.springframework.beans.factory.annotation.Autowired)1 Value (org.springframework.beans.factory.annotation.Value)1 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)1 DataSetContent (org.talend.dataprep.api.dataset.DataSetContent)1 DataSetMetadata (org.talend.dataprep.api.dataset.DataSetMetadata)1 DataSetRowIterator (org.talend.dataprep.api.dataset.json.DataSetRowIterator)1 DataSetRow (org.talend.dataprep.api.dataset.row.DataSetRow)1 InvalidMarker (org.talend.dataprep.api.dataset.row.InvalidMarker)1 TDPException (org.talend.dataprep.exception.TDPException)1 CommonErrorCodes (org.talend.dataprep.exception.error.CommonErrorCodes)1 AnalyzerService (org.talend.dataprep.quality.AnalyzerService)1 FormatFamilyFactory (org.talend.dataprep.schema.FormatFamilyFactory)1 Serializer (org.talend.dataprep.schema.Serializer)1 Analyzer (org.talend.dataquality.common.inference.Analyzer)1