Use of org.talend.dataprep.api.dataset.json.DataSetRowIterator in project data-prep by Talend.
From the class DataSetContentStore, method stream.
/**
* Similarly to {@link #get(DataSetMetadata)}, returns the content of the data set, but as a {@link Stream stream} of
* {@link DataSetRow rows} instead of JSON content.
*
* @param dataSetMetadata The {@link DataSetMetadata data set} to read rows from.
* @param limit A limit to pass to the raw content supplier (use -1 for "no limit"). Used as a parameter to call
* {@link #get(DataSetMetadata, long)}.
* @return A valid <b>{@link DataSetRow}</b> stream.
*/
public Stream<DataSetRow> stream(DataSetMetadata dataSetMetadata, long limit) {
final InputStream inputStream = get(dataSetMetadata, limit);
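// Wrap the raw JSON content in an iterator that deserializes one DataSetRow at a time,
// then expose it as a lazily evaluated, sequential Stream.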
final DataSetRowIterator iterator = new DataSetRowIterator(inputStream);
final Iterable<DataSetRow> rowIterable = () -> iterator;
Stream<DataSetRow> dataSetRowStream = StreamSupport.stream(rowIterable.spliterator(), false);
// Row ids (tdpId) are assigned sequentially, starting at 1.
AtomicLong tdpId = new AtomicLong(1);
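// Build a quality analyzer over the data set's columns; it detects invalid values as rows flow through.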
final List<ColumnMetadata> columns = dataSetMetadata.getRowMetadata().getColumns();
final Analyzer<Analyzers.Result> analyzer = service.build(columns, AnalyzerService.Analysis.QUALITY);
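// Skip empty rows and feed each remaining row's values to the quality analyzer.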
dataSetRowStream = dataSetRowStream.filter(r -> !r.isEmpty()).map(r -> {
final String[] values = r.order(columns).toArray(DataSetRow.SKIP_TDP_ID);
analyzer.analyze(values);
return r;
}).map(new InvalidMarker(columns, analyzer)) // Mark invalid columns as detected by the provided analyzer.
.map(r -> {
// Assign a sequential row id (tdpId) to each row.
r.setTdpId(tdpId.getAndIncrement());
return r;
}).onClose(() -> {
// Make sure to close the original input stream when this stream is closed.
try {
inputStream.close();
} catch (Exception e) {
throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
}
});
return dataSetRowStream;
}
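Because the returned stream registers an onClose handler that releases the underlying InputStream, callers should close it, typically with try-with-resources. A minimal usage sketch, assuming contentStore is an injected DataSetContentStore and metadata a previously loaded DataSetMetadata (both names, and the per-row consumer process, are hypothetical and not part of this snippet):

// Sketch only: contentStore, metadata and process(...) are assumed to exist in the caller's context.
try (Stream<DataSetRow> rows = contentStore.stream(metadata, -1)) { // -1 means "no limit"
    // Rows are read lazily; closing the stream triggers onClose, which closes the underlying input stream.
    rows.forEach(row -> process(row));
}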