use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class MetaxaCore method extract.
/**
 * Returns a model containing all the metadata that could be extracted
 * by reading the given input stream using the given MIME type.
 * <p>
 * If an extractor factory is registered for the MIME type, the given input
 * stream is closed before this method returns, regardless of whether the
 * extraction succeeded or failed. If no factory is registered, the stream
 * is neither read nor closed.
 *
 * @param in
 *            an {@link InputStream} where to read the document from
 * @param docId
 *            a {@link URIImpl} with the document URI
 * @param mimeType
 *            a {@link String} with the MIME type
 * @return a {@link Model} containing the metadata or {@code null} if no
 *         extractor is available for the given MIME type
 * @throws ExtractorException
 *             if there is an error when extracting the metadata
 * @throws IOException
 *             if there is an error when reading or closing the input stream
 */
public Model extract(InputStream in, URIImpl docId, String mimeType) throws ExtractorException, IOException {
    @SuppressWarnings("rawtypes") Set factories = this.extractorRegistry.getExtractorFactories(mimeType);
    Model result = null;
    if (factories != null && !factories.isEmpty()) {
        // get extractor from the first available factory
        ExtractorFactory factory = (ExtractorFactory) factories.iterator().next();
        Extractor extractor = factory.get();
        RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
        RDFContainer container = containerFactory.getRDFContainer(docId);
        boolean extracted = false;
        try {
            extractor.extract(container.getDescribedUri(), new BufferedInputStream(in, 8192), null, mimeType, container);
            extracted = true;
        } finally {
            if (extracted) {
                // normal path: a failure to close is reported to the caller
                in.close();
            } else {
                // failure path: release the stream, but never let a close
                // error hide the exception thrown by the extractor
                try {
                    in.close();
                } catch (IOException ignored) {
                    // intentionally ignored; the extraction failure propagates
                }
            }
        }
        result = container.getModel();
    }
    return result;
}
use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class MP3FileExtractor method addSimpleContact.
/**
 * Records a contact that is known only by its full name.
 * <p>
 * A new resource is obtained from {@code ModelUtil.generateRandomResource},
 * typed as {@code NCO.Contact}, given the supplied full name via
 * {@code NCO.fullname}, and finally attached to the URI described by the
 * container through the given property.
 *
 * @param property
 *            the property linking the described URI to the contact
 * @param fullname
 *            the full name stored on the contact
 * @param container
 *            the container whose model receives the statements
 */
protected void addSimpleContact(URI property, String fullname, RDFContainer container) {
    final Model rdfModel = container.getModel();
    final RDFTerm contactNode = ModelUtil.generateRandomResource(rdfModel);

    // describe the contact itself
    rdfModel.addStatement(contactNode, RDF.type, NCO.Contact);
    rdfModel.addStatement(contactNode, NCO.fullname, fullname);

    // link the described document to the contact
    rdfModel.addStatement(container.getDescribedUri(), property, contactNode);
}
use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class TestMetaxaCore method testMailExtraction.
/**
 * Checks that extracting an RFC 822 mail yields a model containing at least
 * one {@code NMO.plainTextMessageContent} statement.
 *
 * @throws Exception if loading the resource or the extraction fails
 */
@Test
public void testMailExtraction() throws Exception {
    String testFile = "mail-multipart-test.eml";
    InputStream in = getResourceAsStream(testFile);
    assertNotNull("failed to load resource " + testFile, in);
    Model m = extractor.extract(in, new URIImpl("file://" + testFile), "message/rfc822");
    // extract() returns null when no extractor is registered for the MIME type;
    // fail with a clear message instead of a NullPointerException below
    assertNotNull("no model extracted for " + testFile, m);
    boolean textContained = m.contains(Variable.ANY, NMO.plainTextMessageContent, Variable.ANY);
    assertTrue("no plain text message content found in " + testFile, textContained);
}
use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class TestMetaxaCore method testHtmlExtraction.
/**
 * This tests the html extraction: the text extracted from the test document
 * must match the expected text (after cleanup) and the model must contain
 * the expected number of triples.
 *
 * @throws Exception if there is an error during extraction or when reading
 *             the test resources
 */
@Test
public void testHtmlExtraction() throws Exception {
    String testFile = "test.html";
    String testResultFile = "html-res.txt";
    // extract text from html
    InputStream in = getResourceAsStream(testFile);
    assertNotNull("failed to load resource " + testFile, in);
    Model m = extractor.extract(in, new URIImpl("file://" + testFile), "text/html");
    // extract() returns null when no extractor is registered for the MIME type
    assertNotNull("no model extracted for " + testFile, m);
    String text = MetaxaCore.getText(m);
    // get expected result
    InputStream in2 = getResourceAsStream(testResultFile);
    assertNotNull("failed to load resource " + testResultFile, in2);
    String expectedText;
    try {
        expectedText = IOUtils.toString(in2, "utf-8");
    } finally {
        // IOUtils.toString does not close the stream it reads from
        in2.close();
    }
    // test
    assertEquals(cleanup(expectedText), cleanup(text));
    // show triples
    int tripleCounter = this.printTriples(m);
    assertEquals(28, tripleCounter);
}
use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class TestMetaxaCore method testPdfExtraction.
/**
 * This tests the pdf extraction: the text extracted from the test document
 * must match the expected text (after cleanup) and the model must contain
 * the expected number of triples.
 *
 * @throws Exception if there is an error during extraction or when reading
 *             the test resources
 */
@Test
public void testPdfExtraction() throws Exception {
    String testFile = "test.pdf";
    String testResultFile = "pdf-res.txt";
    // extract text from pdf
    InputStream in = getResourceAsStream(testFile);
    assertNotNull("failed to load resource " + testFile, in);
    Model m = extractor.extract(in, new URIImpl("file://" + testFile), "application/pdf");
    // extract() returns null when no extractor is registered for the MIME type
    assertNotNull("no model extracted for " + testFile, m);
    String text = MetaxaCore.getText(m);
    // get expected result
    InputStream in2 = getResourceAsStream(testResultFile);
    assertNotNull("failed to load resource " + testResultFile, in2);
    String expectedText;
    try {
        expectedText = IOUtils.toString(in2, "utf-8");
    } finally {
        // IOUtils.toString does not close the stream it reads from
        in2.close();
    }
    // test
    assertEquals(cleanup(expectedText), cleanup(text));
    // show triples
    int tripleCounter = this.printTriples(m);
    assertEquals(11, tripleCounter);
}
Aggregations