use of org.talend.dataprep.schema.Format in project data-prep by Talend.
the class HtmlDetector method detect.
/**
 * Checks whether the content of the given stream looks like HTML.
 *
 * <p>The stream is never closed by this method; closing it stays the responsibility of the caller. The bytes
 * consumed while sniffing are handed back through mark/reset on the {@link TikaInputStream}, so the stream
 * is rewound to the position it had on entry before the method returns.
 *
 * @param metadata the TIKA {@link Metadata} passed on to the HTML encoding detector
 * @param inputStream the stream to inspect, may be {@code null}
 * @return an HTML {@link Format} carrying the detected charset name, or {@code null} when the stream is
 * {@code null} or no charset could be detected
 * @throws IOException if the stream cannot be read
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {
    // Nothing to sniff: report "not HTML" rather than failing.
    if (inputStream == null) {
        return null;
    }

    // Read at most META_TAG_BUFFER_SIZE bytes from the head of the stream, then rewind it.
    inputStream.mark(FormatUtils.META_TAG_BUFFER_SIZE);
    byte[] sniffed = new byte[FormatUtils.META_TAG_BUFFER_SIZE];
    int filled = 0;
    int lastRead = inputStream.read(sniffed);
    while (lastRead != -1 && filled < sniffed.length) {
        filled += lastRead;
        lastRead = inputStream.read(sniffed, filled, sniffed.length - filled);
    }
    inputStream.reset();

    // Run the encoding detection on a private copy of the head so the caller's stream is left alone.
    String head = FormatUtils.readFromBuffer(sniffed, 0, filled);
    try (InputStream headStream = TikaInputStream.get(IOUtils.toInputStream(head))) {
        Charset detected = htmlEncodingDetector.detect(headStream, metadata);
        return detected == null ? null : new Format(htmlFormatFamily, detected.name());
    }
}
use of org.talend.dataprep.schema.Format in project data-prep by Talend.
the class CSVDetector method detect.
/**
 * Reads an input stream and checks if it has a CSV format.
 *
 * <p>The general contract of a detector is to not close the specified stream before returning. It is the
 * responsibility of the caller to close it. The detector leverages the mark/reset feature of the specified
 * {@link TikaInputStream} in order to let the stream always return the same bytes.
 *
 * <p>Detection is attempted twice: first directly on the specified stream with the specified metadata, then,
 * if that yields nothing, on a copy of the first {@code FormatUtils.META_TAG_BUFFER_SIZE} bytes with a fresh,
 * empty {@link Metadata}.
 *
 * @param metadata the specified TIKA {@link Metadata}
 * @param inputStream the specified input stream, may be {@code null}
 * @return either {@code null} or a CSV format
 * @throws IOException if the stream cannot be read
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {
    Format result = detectText(metadata, inputStream);

    // The fallback needs bytes to sniff. Without a stream there is nothing more to try, and calling
    // mark() on a null stream would throw a NullPointerException instead of reporting "no format".
    if (result != null || inputStream == null) {
        return result;
    }

    // Read at most META_TAG_BUFFER_SIZE bytes from the head of the stream, then rewind it.
    inputStream.mark(FormatUtils.META_TAG_BUFFER_SIZE);
    byte[] buffer = new byte[FormatUtils.META_TAG_BUFFER_SIZE];
    int n = 0;
    int m = inputStream.read(buffer);
    while (m != -1 && n < buffer.length) {
        n += m;
        m = inputStream.read(buffer, n, buffer.length - n);
    }
    inputStream.reset();

    // Second attempt on a private copy of the head.
    // NOTE(review): a fresh Metadata is used here, presumably so hints carried by the caller's metadata
    // do not influence the retry - confirm.
    String head = FormatUtils.readFromBuffer(buffer, 0, n);
    try (InputStream stream = TikaInputStream.get(IOUtils.toInputStream(head))) {
        return detectText(new Metadata(), stream);
    }
}
use of org.talend.dataprep.schema.Format in project data-prep by Talend.
the class CSVDetectorTest method should_detect_CSV_format_and_encoding.
/**
 * A standard CSV resource must be recognised as CSV, together with its encoding.
 */
@Test
public void should_detect_CSV_format_and_encoding() throws IOException {
    try (InputStream csv = getClass().getResourceAsStream("standard.csv")) {
        Format detected = csvDetector.detect(csv);

        Assert.assertNotNull(detected);

        boolean isCsvFamily = detected.getFormatFamily() instanceof CSVFormatFamily;
        assertTrue(isCsvFamily);

        String encoding = detected.getEncoding();
        assertEquals("ISO-8859-1", encoding);
    }
}
use of org.talend.dataprep.schema.Format in project data-prep by Talend.
the class HtmlDetectorTest method guess_html_format_fail.
/**
 * The HTML detector must report no format ({@code null}) for the {@code foo.html} test resource.
 */
@Test
public void guess_html_format_fail() throws Exception {
    String fileName = "foo.html";
    // try-with-resources so the resource stream is closed even when the assertion fails.
    try (java.io.InputStream inputStream = this.getClass().getResourceAsStream(fileName)) {
        Format actual = htmlDetector.detect(inputStream);
        assertNull(actual);
    }
}
use of org.talend.dataprep.schema.Format in project data-prep by Talend.
the class HtmlDetectorTest method guess_html_format_success.
/**
 * The {@code sales-force.xls} test resource must be detected as HTML with a UTF-16 encoding, despite its
 * {@code .xls} file name.
 */
@Test
public void guess_html_format_success() throws Exception {
    String fileName = "sales-force.xls";
    // try-with-resources so the resource stream is closed even when an assertion fails.
    try (java.io.InputStream inputStream = this.getClass().getResourceAsStream(fileName)) {
        Format actual = htmlDetector.detect(inputStream);
        assertTrue(actual.getFormatFamily() instanceof HtmlFormatFamily);
        assertTrue(StringUtils.equals("UTF-16", actual.getEncoding()));
    }
}
Aggregations