Search in sources:

Example 91 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TesseractOCRParserTest method offersNoTypesIfNotFound.

/*
    Verifies that when Tesseract cannot be found at the configured path, the
    TesseractOCRParser reports no supported file types, so that the standard
    image parser is the one selected for images.
     */
@Test
public void offersNoTypesIfNotFound() throws Exception {
    // Build a parse context whose OCR config points at a path that does not exist
    TesseractOCRConfig brokenConfig = new TesseractOCRConfig();
    brokenConfig.setTesseractPath("/made/up/path");
    ParseContext context = new ParseContext();
    context.set(TesseractOCRConfig.class, brokenConfig);

    // The OCR parser must not advertise any type for that context
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    assertEquals(0, ocrParser.getSupportedTypes(context).size());

    // ...and DefaultParser must map PNG to the plain ImageParser rather than to the OCR parser
    DefaultParser fallbackParser = new DefaultParser();
    MediaType pngType = MediaType.image("png");
    assertEquals(ImageParser.class, fallbackParser.getParsers(context).get(pngType).getClass());
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) MediaType(org.apache.tika.mime.MediaType) DefaultParser(org.apache.tika.parser.DefaultParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 92 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class PDFParserTest method testInitializationOfNonPrimitivesViaConfig.

/**
 * Checks that non-primitive PDFParser settings (the OCR strategy and the OCR image type)
 * declared in a Tika XML configuration resource are applied to the PDFParser instance
 * registered for {@code application/pdf} inside the AutoDetectParser.
 *
 * @throws Exception if the configuration resource cannot be read or parsed
 */
@Test
public void testInitializationOfNonPrimitivesViaConfig() throws Exception {
    // try-with-resources: the config stream is closed even when an assertion below fails.
    // A null resource is tolerated by try-with-resources, so assertNotNull still reports it.
    try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-config-non-primitives.xml")) {
        assertNotNull(is);
        TikaConfig tikaConfig = new TikaConfig(is);
        AutoDetectParser p = new AutoDetectParser(tikaConfig);
        MediaType pdf = MediaType.application("pdf");
        Map<MediaType, Parser> parsers = p.getParsers();
        // The top-level entry for PDF is a composite; the actual PDFParser is one level down
        Parser composite = parsers.get(pdf);
        Parser pdfParser = ((CompositeParser) composite).getParsers().get(pdf);
        assertEquals("org.apache.tika.parser.pdf.PDFParser", pdfParser.getClass().getName());
        PDFParserConfig pdfConfig = ((PDFParser) pdfParser).getPDFParserConfig();
        assertEquals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY, pdfConfig.getOcrStrategy());
        assertEquals(ImageType.RGB, pdfConfig.getOcrImageType());
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 93 with MediaType

use of org.apache.tika.mime.MediaType in project uPortal by Jasig.

the class JaxbPortalDataHandlerService method getMediaType.

/**
 * Detects the media type of the given stream using Tika's DefaultDetector, with the file
 * name supplied as a detection hint.
 *
 * @param inputStream the stream to inspect; it is reset before this method returns
 *                    (NOTE(review): reset() requires an earlier mark() — presumably set by the caller; confirm)
 * @param fileName    the resource name handed to the detector and used in log messages
 * @return the detected media type, or {@code null} when detection fails with an IOException
 * @throws IOException if resetting {@code inputStream} fails
 */
protected MediaType getMediaType(BufferedInputStream inputStream, String fileName) throws IOException {
    // Wrapped in a CloseShieldInputStream, presumably so closing the Tika stream leaves inputStream open
    final TikaInputStream shieldedStream = TikaInputStream.get(new CloseShieldInputStream(inputStream));
    MediaType detectedType;
    try {
        final Metadata hints = new Metadata();
        hints.set(Metadata.RESOURCE_NAME_KEY, fileName);
        final Detector typeDetector = new DefaultDetector();
        detectedType = typeDetector.detect(shieldedStream, hints);
        logger.debug("Determined '{}' for '{}'", detectedType, fileName);
    } catch (IOException e) {
        logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e);
        // Despite the wording of the message above, no type is substituted: null is returned
        detectedType = null;
    } finally {
        IOUtils.closeQuietly(shieldedStream);
        // Rewind the buffered stream so bytes consumed by the detector can be read again
        inputStream.reset();
    }
    return detectedType;
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException) CloseShieldInputStream(org.apache.tika.io.CloseShieldInputStream)

Example 94 with MediaType

use of org.apache.tika.mime.MediaType in project winery by eclipse.

the class WriterUtils method storeDefinitions.

/**
 * Stores each element contained in the given definitions as its own definitions file in
 * the repository, together with any relatively referenced artifact files.
 *
 * <p>The definitions are first saved to a fresh temporary directory and loaded back from
 * there; the reloaded copy is the one that is iterated and stored.
 *
 * @param definitions the definitions whose elements are stored
 * @param overwrite   if {@code true}, an element already present in the repository is
 *                    force-deleted and stored again; if {@code false}, it is skipped
 * @param dir         base directory against which relative artifact references are resolved
 * @throws IllegalStateException if the temporary working directory cannot be created
 */
public static void storeDefinitions(Definitions definitions, boolean overwrite, Path dir) {
    Path path;
    try {
        path = Files.createTempDirectory("winery");
    } catch (IOException e) {
        // Fail fast with the cause attached instead of continuing with a null path
        throw new IllegalStateException("Could not create temporary directory for storing definitions", e);
    }
    LOGGER.debug("Store definition: {}", definitions.getId());
    saveDefinitions(definitions, path, definitions.getTargetNamespace(), definitions.getId());
    Definitions cleanDefinitions = loadDefinitions(path, definitions.getTargetNamespace(), definitions.getId());
    CsarImporter csarImporter = new CsarImporter();
    // Failures of individual elements are collected here and reported once all elements were processed
    List<Exception> exceptions = new ArrayList<>();
    cleanDefinitions.getServiceTemplateOrNodeTypeOrNodeTypeImplementation().forEach(entry -> {
        String namespace = csarImporter.getNamespace(entry, definitions.getTargetNamespace());
        csarImporter.setNamespace(entry, namespace);
        String id = ModelUtilities.getId(entry);
        Class<? extends DefinitionsChildId> widClazz = Util.getComponentIdClassForTExtensibleElements(entry.getClass());
        final DefinitionsChildId wid = BackendUtils.getDefinitionsChildId(widClazz, namespace, id, false);
        if (RepositoryFactory.getRepository().exists(wid)) {
            if (overwrite) {
                try {
                    RepositoryFactory.getRepository().forceDelete(wid);
                } catch (IOException e) {
                    exceptions.add(e);
                }
            } else {
                // Element already exists and must not be overwritten: skip this element only
                return;
            }
        }
        if (entry instanceof TArtifactTemplate) {
            TArtifactTemplate.ArtifactReferences artifactReferences = ((TArtifactTemplate) entry).getArtifactReferences();
            // Stream.of + nonNull filter tolerates an artifact template without references
            Stream.of(artifactReferences).filter(Objects::nonNull).flatMap(ref -> ref.getArtifactReference().stream()).filter(Objects::nonNull).forEach(ref -> {
                String reference = ref.getReference();
                URI refURI;
                try {
                    refURI = new URI(reference);
                } catch (URISyntaxException e) {
                    LOGGER.error("Invalid URI {}", reference, e);
                    return;
                }
                // Absolute references are not copied into the repository
                if (refURI.isAbsolute()) {
                    return;
                }
                Path artifactPath = dir.resolve(reference);
                if (!Files.exists(artifactPath)) {
                    LOGGER.error("File not found {}", artifactPath);
                    return;
                }
                ArtifactTemplateFilesDirectoryId aDir = new ArtifactTemplateFilesDirectoryId((ArtifactTemplateId) wid);
                RepositoryFileReference aFile = new RepositoryFileReference(aDir, artifactPath.getFileName().toString());
                try (InputStream is = Files.newInputStream(artifactPath);
                    BufferedInputStream bis = new BufferedInputStream(is)) {
                    MediaType mediaType = BackendUtils.getMimeType(bis, artifactPath.getFileName().toString());
                    RepositoryFactory.getRepository().putContentToFile(aFile, bis, mediaType);
                } catch (IOException e) {
                    LOGGER.error("Could not read artifact template file: {}", artifactPath, e);
                    return;
                }
            });
        }
        // Wrap the single element in its own definitions document and store it
        final Definitions part = BackendUtils.createWrapperDefinitions(wid);
        part.getServiceTemplateOrNodeTypeOrNodeTypeImplementation().add(entry);
        RepositoryFileReference ref = BackendUtils.getRefOfDefinitions(wid);
        String content = BackendUtils.getXMLAsString(part, true);
        try {
            RepositoryFactory.getRepository().putContentToFile(ref, content, MediaTypes.MEDIATYPE_TOSCA_DEFINITIONS);
        } catch (Exception e) {
            exceptions.add(e);
        }
    });
    // Report the collected failures; previously they were gathered but never surfaced
    for (Exception failure : exceptions) {
        LOGGER.error("Error while storing definitions {}", definitions.getId(), failure);
    }
}
Also used : Path(java.nio.file.Path) DefinitionsChildId(org.eclipse.winery.common.ids.definitions.DefinitionsChildId) CsarImporter(org.eclipse.winery.repository.importing.CsarImporter) TArtifactTemplate(org.eclipse.winery.model.tosca.TArtifactTemplate) Definitions(org.eclipse.winery.model.tosca.Definitions) TDefinitions(org.eclipse.winery.model.tosca.TDefinitions) ArrayList(java.util.ArrayList) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) TransformerException(javax.xml.transform.TransformerException) URISyntaxException(java.net.URISyntaxException) JAXBException(javax.xml.bind.JAXBException) ArtifactTemplateFilesDirectoryId(org.eclipse.winery.repository.datatypes.ids.elements.ArtifactTemplateFilesDirectoryId) RepositoryFileReference(org.eclipse.winery.common.RepositoryFileReference) MediaType(org.apache.tika.mime.MediaType)

Example 95 with MediaType

use of org.apache.tika.mime.MediaType in project winery by eclipse.

the class CsarImporter method importOtherImport.

/**
 * Imports a non-TOSCA import (for example an XSD) referenced by the given import element:
 * creates the wrapping definitions file if the import is not yet in the repository and
 * writes the referenced file itself to the repository.
 *
 * SIDE EFFECT: modifies the location of imp to point to the correct relative location (when read from the exported
 * CSAR)
 *
 * @param rootPath  the absolute path where to resolve files from
 * @param imp       the import element; its location is read and then rewritten
 * @param errors    list to which messages are appended when the import cannot be processed
 * @param type      the import type; must not be the TOSCA namespace
 * @param overwrite if {@code true}, the file is written even when the import already exists in the repository
 * @throws IllegalStateException if reading the file or writing it to the repository fails
 */
private void importOtherImport(Path rootPath, TImport imp, final List<String> errors, String type, boolean overwrite) {
    assert (!type.equals(Namespaces.TOSCA_NAMESPACE));
    String loc = imp.getLocation();
    if (!Util.isRelativeURI(loc)) {
        // This is just an information message
        errors.add("Absolute URIs are not resolved by Winery (" + loc + ")");
        return;
    }
    // location URLs are encoded: http://www.w3.org/TR/2001/WD-charmod-20010126/#sec-URIs, RFC http://www.ietf.org/rfc/rfc2396.txt
    loc = Util.URLdecode(loc);
    Path path;
    try {
        path = rootPath.resolve(loc);
    } catch (Exception e) {
        // java.nio.file.InvalidPathException could be thrown which is a RuntimeException
        errors.add(e.getMessage());
        return;
    }
    if (!Files.exists(path)) {
        // fallback for older CSARs, where the location is given from the root
        // getParent() returns null when rootPath has no parent; treat that like a missing file
        Path parent = rootPath.getParent();
        if (parent != null) {
            path = parent.resolve(loc);
        }
        if (parent == null || !Files.exists(path)) {
            errors.add(String.format("File %1$s does not exist", loc));
            return;
        }
    }
    String namespace = imp.getNamespace();
    String fileName = path.getFileName().toString();
    String id = fileName;
    id = FilenameUtils.removeExtension(id);
    // Convention: id of import is filename without extension
    GenericImportId rid;
    if (type.equals(XMLConstants.W3C_XML_SCHEMA_NS_URI)) {
        rid = new XSDImportId(namespace, id, false);
    } else {
        rid = new GenericImportId(namespace, id, false, type);
    }
    boolean importDataExistsInRepo = RepositoryFactory.getRepository().exists(rid);
    if (!importDataExistsInRepo) {
        // We have to
        // a) create a .definitions file
        // b) put the file itself in the repo
        // Create the definitions file
        TDefinitions defs = BackendUtils.createWrapperDefinitions(rid);
        defs.getImport().add(imp);
        // QUICK HACK: We change the imp object's location here and below again
        // This is "OK" as "storeDefinitions" serializes the current state and not the future state of the imp object
        // change the location to point to the file in the folder of the .definitions file
        imp.setLocation(fileName);
        // put the definitions file to the repository
        CsarImporter.storeDefinitions(rid, defs);
    }
    // put the file itself to the repo
    // ref is required to generate fileRef
    RepositoryFileReference ref = BackendUtils.getRefOfDefinitions(rid);
    RepositoryFileReference fileRef = new RepositoryFileReference(ref.getParent(), fileName);
    // location is relative to Definitions/
    // even if the import already exists, we have to adapt the path
    // URIs are encoded
    String newLoc = "../" + Util.getUrlPath(fileRef);
    imp.setLocation(newLoc);
    if (!importDataExistsInRepo || overwrite) {
        // finally write the file to the storage
        try (InputStream is = Files.newInputStream(path);
            BufferedInputStream bis = new BufferedInputStream(is)) {
            MediaType mediaType;
            if (type.equals(XMLConstants.W3C_XML_SCHEMA_NS_URI)) {
                // XSD imports get a fixed media type; everything else is detected from the content
                mediaType = MediaTypes.MEDIATYPE_XSD;
            } else {
                mediaType = BackendUtils.getMimeType(bis, path.getFileName().toString());
            }
            RepositoryFactory.getRepository().putContentToFile(fileRef, bis, mediaType);
        } catch (IllegalArgumentException | IOException e) {
            throw new IllegalStateException(e);
        }
        // we have to update the cache in case of a new XSD to speedup usage of winery
        if (rid instanceof XSDImportId) {
            // We do the initialization asynchronously
            // We do not check whether the XSD has already been checked
            // We cannot just check whether an XSD already has been handled since the XSD could change over time
            // Synchronization at org.eclipse.winery.repository.resources.imports.xsdimports.XSDImportResource.getAllDefinedLocalNames(short) also isn't feasible as the backend doesn't support locks
            CsarImporter.xsdParsingService.submit(() -> {
                CsarImporter.LOGGER.debug("Updating XSD import cache data");
                // We call the queries without storing the result:
                // We use the SIDEEFFECT that a cache is created
                final XsdImportManager xsdImportManager = RepositoryFactory.getRepository().getXsdImportManager();
                xsdImportManager.getAllDeclaredElementsLocalNames();
                xsdImportManager.getAllDefinedTypesLocalNames();
                CsarImporter.LOGGER.debug("Updated XSD import cache data");
            });
        }
    }
}
Also used : XSDImportId(org.eclipse.winery.common.ids.definitions.imports.XSDImportId) BufferedInputStream(java.io.BufferedInputStream) ZipInputStream(java.util.zip.ZipInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException) URISyntaxException(java.net.URISyntaxException) JAXBException(javax.xml.bind.JAXBException) ConfigurationException(org.apache.commons.configuration.ConfigurationException) IOException(java.io.IOException) GenericImportId(org.eclipse.winery.common.ids.definitions.imports.GenericImportId) RepositoryFileReference(org.eclipse.winery.common.RepositoryFileReference) BufferedInputStream(java.io.BufferedInputStream) MediaType(org.apache.tika.mime.MediaType) XsdImportManager(org.eclipse.winery.repository.backend.xsd.XsdImportManager)

Aggregations

MediaType (org.apache.tika.mime.MediaType): 95, Metadata (org.apache.tika.metadata.Metadata): 29, Test (org.junit.Test): 28, InputStream (java.io.InputStream): 26, IOException (java.io.IOException): 18, Parser (org.apache.tika.parser.Parser): 18, TikaInputStream (org.apache.tika.io.TikaInputStream): 17, ParseContext (org.apache.tika.parser.ParseContext): 17, TikaException (org.apache.tika.exception.TikaException): 14, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 14, CompositeParser (org.apache.tika.parser.CompositeParser): 13, ContentHandler (org.xml.sax.ContentHandler): 13, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 12, Detector (org.apache.tika.detect.Detector): 11, TikaTest (org.apache.tika.TikaTest): 10, HashSet (java.util.HashSet): 8, ByteArrayInputStream (java.io.ByteArrayInputStream): 7, ArrayList (java.util.ArrayList): 7, TikaConfig (org.apache.tika.config.TikaConfig): 7, MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry): 7