use of org.semanticdesktop.aperture.rdf.RDFContainer in project stanbol by apache.
the class IksHtmlExtractor method main.
/**
 * Command-line entry point: runs the HTML extractor over every file named in
 * {@code args} and prints the resulting RDF model to standard output.
 *
 * @param args
 *            paths of the files to process; each one is handed to the extractor
 *            as {@code text/html} with the UTF-8 charset
 * @throws Exception
 *             if a file cannot be opened or the extraction/serialization fails
 */
public static void main(String[] args) throws Exception {
    IksHtmlExtractor inst = new IksHtmlExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    // These settings are the same for every file, so compute them once.
    Charset charset = Charset.forName("UTF-8");
    String mimeType = "text/html";
    for (int i = 0; i < args.length; ++i) {
        File file = new File(args[i]);
        InputStream input = new FileInputStream(file);
        try {
            URI uri = new URIImpl(file.toURI().toString());
            RDFContainer container = rdfFactory.getRDFContainer(uri);
            try {
                inst.extract(uri, input, charset, mimeType, container);
                System.out.println("Model for " + args[i]);
                container.getModel().writeTo(System.out);
                System.out.println();
            } finally {
                // Release the container even when extraction or printing fails.
                container.dispose();
            }
        } finally {
            // The original never closed the file stream, leaking one handle per file.
            input.close();
        }
    }
}
use of org.semanticdesktop.aperture.rdf.RDFContainer in project stanbol by apache.
the class SimpleMailExtractor method extractTextFromHtml.
/**
 * Runs the HTML text extractor over the given HTML string and concatenates the
 * extracted title, author, description, keywords and body text into one string.
 * <p>
 * The extraction is done into a temporary container created for the URI described
 * by {@code rdf}; {@code rdf} itself is only read for that URI.
 *
 * @param string
 *            the HTML content
 * @param charset
 *            name of the charset passed to the extractor along with the bytes; when
 *            non-null and supported it is also used to encode {@code string}, otherwise
 *            the platform default charset is used
 * @param rdf
 *            container whose described URI identifies the document
 * @return the extracted metadata and text; title, author and description are appended
 *         with a newline, keywords with a blank
 * @throws ExtractorException
 *             if the HTML extractor cannot be initialized or the extraction fails
 */
protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
    // parse the HTML and extract full-text and metadata
    HtmlTextExtractUtil extractor;
    try {
        extractor = new HtmlTextExtractUtil();
    } catch (InitializationException e) {
        // keep the original exception as cause so the stack trace is not lost
        throw new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage(), e);
    }
    // Encode with the same charset that is handed to the extractor; encoding with the
    // platform default (as before) can produce bytes that do not match 'charset'.
    byte[] htmlBytes;
    if (charset == null) {
        htmlBytes = string.getBytes();
    } else {
        try {
            htmlBytes = string.getBytes(charset);
        } catch (java.io.UnsupportedEncodingException e) {
            // unknown charset name: fall back to the previous behaviour
            logger.debug("unsupported charset '{}', using platform default", charset);
            htmlBytes = string.getBytes();
        }
    }
    InputStream stream = new ByteArrayInputStream(htmlBytes);
    RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
    URI id = rdf.getDescribedUri();
    RDFContainer result = containerFactory.getRDFContainer(id);
    Model meta = result.getModel();
    try {
        extractor.extract(id, charset, stream, result);
        // append metadata and full-text to a string buffer
        StringBuilder buffer = new StringBuilder(32 * 1024);
        append(buffer, extractor.getTitle(meta), "\n");
        append(buffer, extractor.getAuthor(meta), "\n");
        append(buffer, extractor.getDescription(meta), "\n");
        List<String> keywords = extractor.getKeywords(meta);
        for (String kw : keywords) {
            append(buffer, kw, " ");
        }
        buffer.append("\n");
        append(buffer, extractor.getText(meta), " ");
        logger.debug("text extracted:\n{}", buffer);
        // return the buffer's content
        return buffer.toString();
    } finally {
        // close the temporary model on every path, including failures
        meta.close();
    }
}
use of org.semanticdesktop.aperture.rdf.RDFContainer in project stanbol by apache.
the class SimpleMailExtractor method main.
/**
 * Command-line entry point: runs the mail extractor over every file named in
 * {@code args} and writes the resulting model to standard output as RDF/XML.
 *
 * @param args
 *            paths of the files to process
 * @throws Exception
 *             if a file cannot be opened or the extraction/serialization fails
 */
public static void main(String[] args) throws Exception {
    SimpleMailExtractor extractor = new SimpleMailExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    for (int i = 0; i < args.length; ++i) {
        File file = new File(args[i]);
        InputStream in = new FileInputStream(file);
        try {
            URI uri = new URIImpl(file.toURI().toString());
            RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
            Model model = rdfContainer.getModel();
            try {
                // charset and MIME type are passed as null, exactly as before
                extractor.extract(uri, in, null, null, rdfContainer);
                model.writeTo(System.out, Syntax.RdfXml);
            } finally {
                // close the model even when extraction or serialization fails
                model.close();
            }
        } finally {
            // The original never closed the file stream, leaking one handle per file.
            in.close();
        }
    }
}
use of org.semanticdesktop.aperture.rdf.RDFContainer in project stanbol by apache.
the class MetaxaCore method extract.
/**
 * Returns a model containing all the metadata that could be extracted
 * by reading the given input stream using the given MIME type.
 * <p>
 * The extractor is taken from the first factory registered for the MIME type.
 * When an extraction is attempted, {@code in} is closed by this method, whether
 * or not the extraction succeeds. When no extractor is available the stream is
 * left untouched.
 *
 * @param in
 *            an {@link InputStream} where to read the document from
 * @param docId
 *            a {@link String} with the document URI
 * @param mimeType
 *            a {@link String} with the MIME type
 * @return a {@link Model} containing the metadata or {@code null} if no
 *         extractor is available for the given MIME type
 * @throws ExtractorException
 *             if there is an error when extracting the metadata
 * @throws IOException
 *             if there is an error when reading or closing the input stream
 */
public Model extract(InputStream in, URIImpl docId, String mimeType) throws ExtractorException, IOException {
    @SuppressWarnings("rawtypes") Set factories = this.extractorRegistry.getExtractorFactories(mimeType);
    if (factories == null || factories.isEmpty()) {
        // no extractor registered for this MIME type
        return null;
    }
    // get extractor from the first available factory
    ExtractorFactory factory = (ExtractorFactory) factories.iterator().next();
    Extractor extractor = factory.get();
    RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
    RDFContainer container = containerFactory.getRDFContainer(docId);
    boolean succeeded = false;
    try {
        try {
            extractor.extract(container.getDescribedUri(), new BufferedInputStream(in, 8192), null, mimeType, container);
        } finally {
            // previously skipped when the extractor threw, leaking the stream
            in.close();
        }
        succeeded = true;
    } finally {
        if (!succeeded) {
            // the model is never handed to the caller on failure, so release it here
            container.dispose();
        }
    }
    return container.getModel();
}
Aggregations