use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.
the class IksHtmlExtractor method main.
public static void main(String[] args) throws Exception {
    int argv = 0;
    IksHtmlExtractor inst = new IksHtmlExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    for (int i = argv; i < args.length; ++i) {
        File file = new File(args[i]);
        InputStream input = new FileInputStream(file);
        Charset charset = Charset.forName("UTF-8");
        String mimeType = "text/html";
        URI uri = new URIImpl(file.toURI().toString());
        RDFContainer container = rdfFactory.getRDFContainer(uri);
        inst.extract(uri, input, charset, mimeType, container);
        System.out.println("Model for " + args[i]);
        container.getModel().writeTo(System.out);
        System.out.println();
        container.dispose();
    }
}
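Note that the loop above opens a FileInputStream per argument but never closes it. A minimal sketch of the same loop with try-with-resources, reusing the inst, rdfFactory and argv variables declared above:

for (int i = argv; i < args.length; ++i) {
    File file = new File(args[i]);
    // try-with-resources closes the stream even if extract(...) throws
    try (InputStream input = new FileInputStream(file)) {
        URI uri = new URIImpl(file.toURI().toString());
        RDFContainer container = rdfFactory.getRDFContainer(uri);
        inst.extract(uri, input, Charset.forName("UTF-8"), "text/html", container);
        System.out.println("Model for " + args[i]);
        container.getModel().writeTo(System.out);
        System.out.println();
        container.dispose();
    }
}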
use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.
the class SimpleMailExtractor method main.
public static void main(String[] args) throws Exception {
    int argv = 0;
    SimpleMailExtractor extractor = new SimpleMailExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    for (int i = argv; i < args.length; ++i) {
        File file = new File(args[i]);
        InputStream in = new FileInputStream(file);
        URI uri = new URIImpl(file.toURI().toString());
        RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
        extractor.extract(uri, in, null, null, rdfContainer);
        Model model = rdfContainer.getModel();
        model.writeTo(System.out, Syntax.RdfXml);
        model.close();
    }
}
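The output is not tied to RDF/XML: rdf2go's org.ontoware.rdf2go.model.Syntax also defines Turtle and N-Triples constants. A sketch of the dump step emitting Turtle instead, with the model closed even if serialization fails:

Model model = rdfContainer.getModel();
try {
    // Syntax.Turtle and Syntax.Ntriples are alternatives to Syntax.RdfXml
    model.writeTo(System.out, Syntax.Turtle);
} finally {
    model.close();
}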
use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.
the class TestMetaxaCore method testRdfaExtraction.
/**
 * This tests the RDFa extraction.
 *
 * @throws ExtractorException if there is an error during extraction
 * @throws IOException if there is an error when reading the document
 */
@Test
public void testRdfaExtraction() throws Exception {
    String testFile = "test-rdfa.html";
    String testResultFile = "rdfa-res.txt";
    // extract text from RDFa annotated html
    InputStream in = getResourceAsStream(testFile);
    assertNotNull("failed to load resource " + testFile, in);
    Model m = extractor.extract(in, new URIImpl("file://" + testFile), "text/html");
    String text = MetaxaCore.getText(m);
    // get expected result
    InputStream in2 = getResourceAsStream(testResultFile);
    assertNotNull("failed to load resource " + testResultFile, in2);
    String expectedText = IOUtils.toString(in2, "utf-8");
    // test
    assertEquals(cleanup(expectedText), cleanup(text));
    // show triples
    int tripleCounter = this.printTriples(m);
    assertEquals(10, tripleCounter);
}
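The test calls a printTriples helper that this snippet does not show. Its name and return contract are taken from the call site above; the body is a hypothetical sketch that prints each statement of the model and returns the triple count:

private int printTriples(Model m) {
    int count = 0;
    ClosableIterator<Statement> it = m.iterator();
    try {
        while (it.hasNext()) {
            // print each extracted statement; the count is asserted by the tests
            System.out.println(it.next());
            ++count;
        }
    } finally {
        it.close();
    }
    return count;
}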
use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.
the class TestMetaxaCore method testPdfExtraction.
/**
 * This tests the PDF extraction.
 *
 * @throws ExtractorException if there is an error during extraction
 * @throws IOException if there is an error when reading the document
 */
@Test
public void testPdfExtraction() throws Exception {
    String testFile = "test.pdf";
    String testResultFile = "pdf-res.txt";
    // extract text from pdf
    InputStream in = getResourceAsStream(testFile);
    assertNotNull("failed to load resource " + testFile, in);
    Model m = extractor.extract(in, new URIImpl("file://" + testFile), "application/pdf");
    String text = MetaxaCore.getText(m);
    // get expected result
    InputStream in2 = getResourceAsStream(testResultFile);
    assertNotNull("failed to load resource " + testResultFile, in2);
    String expectedText = IOUtils.toString(in2, "utf-8");
    // test
    assertEquals(cleanup(expectedText), cleanup(text));
    // show triples
    int tripleCounter = this.printTriples(m);
    assertEquals(11, tripleCounter);
}
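cleanup is another helper omitted from these snippets. Judging from the assertEquals calls it normalizes both texts before comparison; a plausible, purely hypothetical implementation collapses whitespace runs so line breaks and indentation do not affect the check:

private String cleanup(String text) {
    // collapse every run of whitespace to a single space and trim the ends,
    // so only the visible words are compared (body is an assumption)
    return text.trim().replaceAll("\\s+", " ");
}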
use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.
the class MetaxaEngine method computeEnhancements.
public void computeEnhancements(ContentItem ci) throws EngineException {
    // get model from the extraction
    URIImpl docId;
    Model m = null;
    ci.getLock().readLock().lock();
    try {
        docId = new URIImpl(ci.getUri().getUnicodeString());
        m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } catch (IOException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // the extracted plain text from the model
    if (null == m) {
        log.debug("Unable to process ContentItem {} (mime type {}) with Metaxa", ci.getUri(), ci.getMimeType());
        return;
    }
    ContentSink plainTextSink;
    try {
        plainTextSink = ciFactory.createContentSink("text/plain");
    } catch (IOException e) {
        m.close();
        throw new EngineException("Unable to initialise Blob for storing the plain text content", e);
    }
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    RDF2GoUtils.urifyBlankNodes(m);
    ClosableIterator<Statement> it = m.iterator();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8));
    // used to detect if some text was extracted
    boolean textExtracted = false;
    try {
        // first add to a temporary graph
        Graph g = new SimpleGraph();
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            // the plain text Blob!
            if (oneStmt.getSubject().equals(docId) && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                String text = oneStmt.getObject().toString();
                if (text != null && !text.isEmpty()) {
                    try {
                        out.write(text);
                    } catch (IOException e) {
                        throw new EngineException("Unable to write extracted plain text to Blob (blob impl: " + plainTextSink.getBlob().getClass() + ")", e);
                    }
                    textExtracted = true;
                    if (includeText) {
                        BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                        IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                        RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                        g.add(new TripleImpl(subject, predicate, object));
                    }
                }
            } else {
                // add metadata to the metadata of the contentItem
                BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    g.add(t);
                    log.debug("added " + t.toString());
                }
            }
        }
        // add the extracted triples to the metadata of the ContentItem
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(g);
            g = null;
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } finally {
        it.close();
        m.close();
        IOUtils.closeQuietly(out);
    }
    if (textExtracted) {
        // add plain text to the content item
        IRI blobUri = new IRI("urn:metaxa:plain-text:" + randomUUID());
        ci.addPart(blobUri, plainTextSink.getBlob());
    }
}
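computeEnhancements converts rdf2go nodes into Clerezza terms via asClerezzaResource, which is not shown here. The name and call shape come from the code above; the body below is a hedged sketch, with literal handling deliberately simplified to plain literals:

private RDFTerm asClerezzaResource(org.ontoware.rdf2go.model.node.Node node,
        HashMap<BlankNode, BlankNode> blankNodeMap) {
    // blankNodeMap mirrors the call sites above; it goes unused in this sketch
    // because blank nodes are urified before conversion
    if (node instanceof org.ontoware.rdf2go.model.node.URI) {
        return new IRI(node.toString());
    }
    if (node instanceof org.ontoware.rdf2go.model.node.Literal) {
        // simplified: datatype and language-tag handling omitted in this sketch
        return new PlainLiteralImpl(((org.ontoware.rdf2go.model.node.Literal) node).getValue());
    }
    // blank nodes were rewritten to URIs by RDF2GoUtils.urifyBlankNodes(m) above,
    // so none should reach this point; the callers already guard against null
    return null;
}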