Use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
Class HtmlSerializer, method deserialize:
/**
 * Parses the given raw HTML content and streams the extracted table values to
 * {@code jsonOutput} as a JSON array of records, one JSON object per row, keyed
 * by the column ids from the dataset metadata.
 *
 * NOTE(review): the number of expected columns is taken from the dataset's row
 * metadata; values beyond that count are silently dropped (see the idx guard).
 *
 * @param rawContent the raw HTML input to parse.
 * @param dataSetMetadata metadata providing the column list (ids) for the rows.
 * @param jsonOutput destination stream; always closed in the finally block.
 * @param limit maximum number of value rows the content handler will collect.
 */
private void deserialize(InputStream rawContent, DataSetMetadata dataSetMetadata, OutputStream jsonOutput, long limit) {
try {
List<ColumnMetadata> columns = dataSetMetadata.getRowMetadata().getColumns();
// Content handler collects at most `limit` rows, sized to the column count.
SimpleValuesContentHandler valuesContentHandler = new SimpleValuesContentHandler(columns.size(), limit);
HtmlParser htmlParser = new HtmlParser();
Metadata metadata = new Metadata();
htmlParser.parse(rawContent, valuesContentHandler, metadata, new ParseContext());
JsonGenerator generator = new JsonFactory().createGenerator(jsonOutput);
// start the record
generator.writeStartArray();
for (List<String> values : valuesContentHandler.getValues()) {
if (values.isEmpty()) {
// avoid empty record which can fail analysis
continue;
}
generator.writeStartObject();
int idx = 0;
for (String value : values) {
// Only write values that map to a known column; extra trailing values
// (idx >= columns.size()) are ignored and idx is left unchanged.
if (idx < columns.size()) {
ColumnMetadata columnMetadata = columns.get(idx);
generator.writeFieldName(columnMetadata.getId());
if (value != null) {
generator.writeString(value);
} else {
generator.writeNull();
}
idx++;
}
}
generator.writeEndObject();
}
// end the record
generator.writeEndArray();
// Flush (not close) the generator: jsonOutput is closed in the finally
// block, and closing the generator would also close the underlying stream.
generator.flush();
} catch (Exception e) {
// Consumer may very well interrupt consumption of stream (in case of limit(n) use for sampling).
// This is not an issue as consumer is allowed to partially consumes results, it's up to the
// consumer to ensure data it consumed is consistent.
LOGGER.debug("Unable to continue serialization for {}. Skipping remaining content.", dataSetMetadata.getId(), e);
} finally {
try {
jsonOutput.close();
} catch (IOException e) {
LOGGER.error("Unable to close output", e);
}
}
}
Use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
Class DatasetUpdateListener, method onUpdate:
@EventListener
public void onUpdate(DatasetUpdatedEvent event) {
    // Updating a dataset invalidates both its sample cache entry and every
    // transformation cache entry that references the dataset.
    final DataSetMetadata updatedMetadata = event.getSource();
    final String datasetId = updatedMetadata.getId();

    // Evict the cached sample for this dataset.
    LOGGER.debug("Evicting sample cache entry for #{}", datasetId);
    publisher.publishEvent(new CleanCacheEvent(() -> "dataset-sample_" + datasetId));
    LOGGER.debug("Evicting sample cache entry for #{} done.", datasetId);

    // Evict transformation cache entries whose key embeds this dataset id.
    LOGGER.debug("Evicting transformation cache entry for dataset #{}", datasetId);
    final ContentCacheKey transformationKey = new ContentCacheKey() {

        @Override
        public String getKey() {
            return datasetId;
        }

        @Override
        public Predicate<String> getMatcher() {
            // Matches any cache key containing "_<datasetId>_".
            final Pattern pattern = Pattern.compile(".*_" + getKey() + "_.*");
            return candidate -> pattern.matcher(candidate).matches();
        }
    };
    publisher.publishEvent(new CleanCacheEvent(transformationKey, Boolean.TRUE));
    LOGGER.debug("Evicting transformation cache entry for dataset #{} done.", datasetId);
}
Use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
Class XlsWriterTest, method createSchemaParser:
/**
* utility function
*/
/**
 * Utility function: exports the dataset found in the given classpath JSON
 * resource to a temporary XLSX file, then wraps the freshly written file in a
 * schema-parser request tagged with metadata id "123".
 *
 * @param inputFileName classpath resource containing the dataset as JSON.
 * @return a {@link SchemaParser.Request} reading the exported XLSX content.
 * @throws Exception on any export or I/O failure.
 */
public SchemaParser.Request createSchemaParser(String inputFileName) throws Exception {
    // Fix: the suffix is appended verbatim by createTempFile, so the previous
    // value "xlsx" (no dot) produced files without a real .xlsx extension.
    Path path = Files.createTempFile("datarep-foo", ".xlsx");
    Files.deleteIfExists(path);
    try (final OutputStream outputStream = Files.newOutputStream(path)) {
        final Configuration configuration = Configuration
                .builder()
                .format(XlsFormat.XLSX)
                .output(outputStream)
                .actions("")
                .build();
        final Transformer exporter = factory.get(configuration);
        final InputStream inputStream = XlsWriterTest.class.getResourceAsStream(inputFileName);
        // Fail fast with a clear message instead of an obscure NPE inside
        // Jackson when the test resource is missing.
        if (inputStream == null) {
            throw new IllegalArgumentException("test resource not found: " + inputFileName);
        }
        // The parser takes ownership of inputStream and closes it with this block.
        try (JsonParser parser = mapper.getFactory().createParser(inputStream)) {
            final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(parser);
            exporter.buildExecutable(dataSet, configuration).execute();
        }
    }
    DataSetMetadata metadata = metadataBuilder.metadata().id("123").build();
    return new SchemaParser.Request(Files.newInputStream(path), metadata);
}
Use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
Class PreparationDatasetRowUpdaterTest, method updatePreparations:
@Test
public void updatePreparations() throws Exception {
    // given: a single preparation pointing at a dataset that has row metadata
    final String datasetId = "dataset id";
    final Preparation preparation = new Preparation("prepId", "123456");
    preparation.setDataSetId(datasetId);
    when(preparationRepository.list(Preparation.class)).thenReturn(singletonList(preparation).stream());

    final DataSetMetadata datasetMetadata = new DataSetMetadata();
    datasetMetadata.setRowMetadata(new RowMetadata());
    when(dataSetMetadataRepository.get(datasetId)).thenReturn(datasetMetadata);

    // when
    updater.updatePreparations();

    // then: the preparation was listed, updated and saved back
    verify(preparationRepository, times(1)).list(Preparation.class);
    verify(preparationRepository, times(1)).add(preparation);
    verify(dataSetMetadataRepository, only()).get(datasetId);
}
Use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
Class PreparationDatasetRowUpdater, method addRowMetadata:
/**
* Add the row metadata of the dataset to the preparation.
*
* @param preparation the preparation to update.
* @return the updated preparation.
*/
/**
 * Add the row metadata of the dataset to the preparation.
 *
 * @param preparation the preparation to update.
 * @return the updated preparation.
 */
private Preparation addRowMetadata(Preparation preparation) {
    LOGGER.debug("adding row metadata to preparation {}", preparation);
    final DataSetMetadata metadata = dataSetMetadataRepository.get(preparation.getDataSetId());
    if (metadata == null) {
        // Nothing to copy: keep the preparation's row metadata untouched.
        LOGGER.debug("The metadata of dataset {} is null and will not be used to set the metadata of preparation {}.", preparation.getDataSetId(), preparation.getId());
    } else {
        preparation.setRowMetadata(metadata.getRowMetadata());
    }
    return preparation;
}
Aggregations