use of org.apache.tika.parser.html.HtmlParser in project acs-aem-commons by Adobe-Consulting-Services.
the class BrokenLinksReport method collectPaths.
/**
 * Collect references from a JCR property.
 * A property can be one of:
 * <ol>
 * <li>A string containing a reference, e.g. fileReference=/content/dam/image.png</li>
 * <li>An array of strings, e.g. fileReference=[/content/dam/image1.png, /content/dam/image2.png]</li>
 * <li>An HTML fragment containing links, e.g.
 * <pre>
 * <p>
 * <a href="/content/site/page.html">hello</a>
 * <img src="/content/dam/image1.png">
 * </p>
 * </pre>
 * </li>
 * </ol>
 *
 * @param property   an entry from a ValueMap
 * @param htmlFields list of properties containing HTML
 * @return stream of extracted references
 */
static Stream<String> collectPaths(Map.Entry<String, Object> property, Set<String> htmlFields) {
    Object p = property.getValue();
    Stream<String> stream;
    if (p.getClass() == String[].class) {
        stream = Arrays.stream((String[]) p);
    } else if (p.getClass() == String.class) {
        stream = Stream.of((String) p);
    } else {
        stream = Stream.empty();
    }
    if (htmlFields.contains(property.getKey())) {
        stream = stream.flatMap(val -> {
            try {
                // parse html and extract links via underlying tagsoup library
                LinkContentHandler linkHandler = new LinkContentHandler();
                HtmlParser parser = new HtmlParser();
                parser.parse(new ByteArrayInputStream(val.getBytes("utf-8")), linkHandler, new Metadata(), new ParseContext());
                return linkHandler.getLinks().stream().map(Link::getUri);
            } catch (Exception e) {
                return Stream.empty();
            }
        });
    }
    return stream;
}
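For reference, a minimal self-contained sketch of the same Tika pattern used above: feed an HTML fragment to HtmlParser with a LinkContentHandler and collect the extracted URIs. The fragment, class name, and main method are illustrative only and are not part of BrokenLinksReport.

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.Link;
import org.apache.tika.sax.LinkContentHandler;

public class ExtractLinksExample {
    public static void main(String[] args) throws Exception {
        // Illustrative HTML fragment; in BrokenLinksReport this value comes from a JCR property.
        String html = "<p><a href=\"/content/site/page.html\">hello</a>"
                + "<img src=\"/content/dam/image1.png\"></p>";

        // LinkContentHandler collects href/src attributes from <a>, <img>, <link>, etc.
        LinkContentHandler linkHandler = new LinkContentHandler();
        new HtmlParser().parse(
                new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)),
                linkHandler, new Metadata(), new ParseContext());

        // Prints /content/site/page.html and /content/dam/image1.png
        linkHandler.getLinks().stream().map(Link::getUri).forEach(System.out::println);
    }
}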
use of org.apache.tika.parser.html.HtmlParser in project data-prep by Talend.
the class HtmlSerializer method deserialize.
private void deserialize(InputStream rawContent, DataSetMetadata dataSetMetadata, OutputStream jsonOutput, long limit) {
    try {
        List<ColumnMetadata> columns = dataSetMetadata.getRowMetadata().getColumns();
        SimpleValuesContentHandler valuesContentHandler = new SimpleValuesContentHandler(columns.size(), limit);
        HtmlParser htmlParser = new HtmlParser();
        Metadata metadata = new Metadata();
        htmlParser.parse(rawContent, valuesContentHandler, metadata, new ParseContext());
        JsonGenerator generator = new JsonFactory().createGenerator(jsonOutput);
        // start the records array
        generator.writeStartArray();
        for (List<String> values : valuesContentHandler.getValues()) {
            if (values.isEmpty()) {
                // skip empty records, which can fail analysis
                continue;
            }
            generator.writeStartObject();
            int idx = 0;
            for (String value : values) {
                if (idx < columns.size()) {
                    ColumnMetadata columnMetadata = columns.get(idx);
                    generator.writeFieldName(columnMetadata.getId());
                    if (value != null) {
                        generator.writeString(value);
                    } else {
                        generator.writeNull();
                    }
                    idx++;
                }
            }
            generator.writeEndObject();
        }
        // end the records array
        generator.writeEndArray();
        generator.flush();
    } catch (Exception e) {
        // The consumer may very well interrupt consumption of the stream (e.g. when limit(n) is used for sampling).
        // This is not an issue, as the consumer is allowed to partially consume results; it is up to the
        // consumer to ensure the data it consumed is consistent.
        LOGGER.debug("Unable to continue serialization for {}. Skipping remaining content.", dataSetMetadata.getId(), e);
    } finally {
        try {
            jsonOutput.close();
        } catch (IOException e) {
            LOGGER.error("Unable to close output", e);
        }
    }
}
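SimpleValuesContentHandler is Talend-internal and not shown in this snippet. As a rough approximation of the overall pattern (parse an HTML table with Tika, then serialize rows with Jackson's JsonGenerator), the following self-contained sketch uses a hand-rolled SAX handler in its place; all class, method, and column names here are illustrative assumptions, not data-prep code.

import java.io.ByteArrayInputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;

public class HtmlTableToJsonSketch {

    /** Collects <td> text per <tr>; a hypothetical stand-in for Talend's SimpleValuesContentHandler. */
    static class RowValuesHandler extends DefaultHandler {
        final List<List<String>> rows = new ArrayList<>();
        private List<String> currentRow;
        private StringBuilder currentCell;

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) {
            if ("tr".equals(localName)) {
                currentRow = new ArrayList<>();
            } else if ("td".equals(localName)) {
                currentCell = new StringBuilder();
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) {
            if (currentCell != null) {
                currentCell.append(ch, start, length);
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) {
            if ("td".equals(localName) && currentRow != null) {
                currentRow.add(currentCell.toString().trim());
                currentCell = null;
            } else if ("tr".equals(localName) && currentRow != null) {
                if (!currentRow.isEmpty()) {
                    rows.add(currentRow);
                }
                currentRow = null;
            }
        }
    }

    /** Parses an HTML table and writes each row as a JSON object keyed by the given column ids. */
    static void htmlTableToJson(String html, List<String> columnIds, OutputStream jsonOutput) throws Exception {
        RowValuesHandler handler = new RowValuesHandler();
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)),
                handler, new Metadata(), new ParseContext());

        try (JsonGenerator generator = new JsonFactory().createGenerator(jsonOutput)) {
            generator.writeStartArray();
            for (List<String> row : handler.rows) {
                generator.writeStartObject();
                for (int i = 0; i < row.size() && i < columnIds.size(); i++) {
                    generator.writeFieldName(columnIds.get(i));
                    generator.writeString(row.get(i));
                }
                generator.writeEndObject();
            }
            generator.writeEndArray();
        }
    }

    public static void main(String[] args) throws Exception {
        String html = "<table><tr><td>alice</td><td>smith</td></tr><tr><td>bob</td><td>jones</td></tr></table>";
        // Prints [{"0000":"alice","0001":"smith"},{"0000":"bob","0001":"jones"}]
        htmlTableToJson(html, Arrays.asList("0000", "0001"), System.out);
    }
}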
use of org.apache.tika.parser.html.HtmlParser in project data-prep by Talend.
the class HtmlSchemaParser method parse.
/**
* @see SchemaParser#parse(Request)
*/
@Override
public Schema parse(Request request) {
    try {
        SimpleHeadersContentHandler headersContentHandler = new SimpleHeadersContentHandler();
        InputStream inputStream = request.getContent();
        HtmlParser htmlParser = new HtmlParser();
        Metadata metadata = new Metadata();
        htmlParser.parse(inputStream, headersContentHandler, metadata, new ParseContext());
        List<ColumnMetadata> columns = new ArrayList<>(headersContentHandler.getHeaderValues().size());
        for (String headerValue : headersContentHandler.getHeaderValues()) {
            columns.add(ColumnMetadata.Builder.column() //
                    .type(Type.STRING) // ATM not doing any complicated type calculation
                    .name(headerValue) //
                    .id(columns.size()) //
                    .build());
        }
        Schema.SheetContent sheetContent = new Schema.SheetContent();
        sheetContent.setColumnMetadatas(columns);
        return Schema.Builder.parserResult() //
                .sheetContents(Collections.singletonList(sheetContent)) //
                .draft(false) //
                .build();
    } catch (Exception e) {
        LOGGER.debug("Exception during parsing html request :" + e.getMessage(), e);
        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
    }
}
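SimpleHeadersContentHandler is likewise Talend-internal. A minimal illustrative stand-in that collects <th> text from the XHTML SAX events emitted by Tika's HtmlParser could look like the sketch below; the class name and sample HTML are assumptions, not data-prep code.

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

public class HeaderCellsHandler extends DefaultHandler {

    private final List<String> headerValues = new ArrayList<>();
    private StringBuilder currentHeader;

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) {
        if ("th".equals(localName)) {
            currentHeader = new StringBuilder();
        }
    }

    @Override
    public void characters(char[] ch, int start, int length) {
        if (currentHeader != null) {
            currentHeader.append(ch, start, length);
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) {
        if ("th".equals(localName) && currentHeader != null) {
            headerValues.add(currentHeader.toString().trim());
            currentHeader = null;
        }
    }

    public List<String> getHeaderValues() {
        return headerValues;
    }

    public static void main(String[] args) throws Exception {
        String html = "<table><tr><th>first name</th><th>last name</th></tr></table>";
        HeaderCellsHandler handler = new HeaderCellsHandler();
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)),
                handler, new Metadata(), new ParseContext());
        // Prints [first name, last name]; HtmlSchemaParser turns each header value into a STRING column.
        System.out.println(handler.getHeaderValues());
    }
}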