Examples with XML2ContentHandler - de.catma.document.source.contenthandler.XML2ContentHandler

Example 1 with XML2ContentHandler

use of de.catma.document.source.contenthandler.XML2ContentHandler in project catma by forTEXT.

the class ProjectView method addUploadFile.

private void addUploadFile(UploadFile uploadFile, boolean useApostropheAsSeparator, String collectionNamePattern) {
    SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo(uploadFile.getIndexInfoSet(useApostropheAsSeparator), uploadFile.getContentInfoSet(), uploadFile.getTechInfoSet());
    SourceContentHandler contentHandler = sourceDocumentInfo.getTechInfoSet().getMimeType().equals(FileType.XML2.getMimeType()) ? new XML2ContentHandler() : new TikaContentHandler();
    contentHandler.setSourceDocumentInfo(sourceDocumentInfo);
    SourceDocument document = new SourceDocument(uploadFile.getUuid(), contentHandler);
    try {
        String content = document.getContent();
        FileOSType fileOSType = FileOSType.getFileOSType(content);
        sourceDocumentInfo.getTechInfoSet().setFileOSType(fileOSType);
        CRC32 checksum = new CRC32();
        checksum.update(content.getBytes());
        sourceDocumentInfo.getTechInfoSet().setChecksum(checksum.getValue());
        project.insert(document);
        AnnotationCollection intrinsicMarkupCollection = uploadFile.getIntrinsicMarkupCollection();
        if (intrinsicMarkupCollection != null) {
            project.importCollection(Collections.emptyList(), intrinsicMarkupCollection);
        }
        if (collectionNamePattern != null && !collectionNamePattern.isEmpty()) {
            String collectionName = collectionNamePattern.replace("{{Title}}", uploadFile.getTitle());
            project.createUserMarkupCollection(collectionName, document);
        }
    } catch (IOException e) {
        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, String.format("Error loading content of %1$s", uploadFile.getTempFilename().toString()), e);
        String errorMsg = e.getMessage();
        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
            errorMsg = "";
        }
        Notification.show("Error", String.format("Error loading content of %1$s! " + "This document will be skipped!\n The underlying error message was:\n%2$s", uploadFile.getTitle(), errorMsg), Type.ERROR_MESSAGE);
    }
}

Also used : AnnotationCollection(de.catma.document.annotation.AnnotationCollection) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) CRC32(java.util.zip.CRC32) TikaContentHandler(de.catma.document.source.contenthandler.TikaContentHandler) SourceDocument(de.catma.document.source.SourceDocument) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) FileOSType(de.catma.document.source.FileOSType) IOException(java.io.IOException) SourceContentHandler(de.catma.document.source.contenthandler.SourceContentHandler)

Example 2 with XML2ContentHandler

use of de.catma.document.source.contenthandler.XML2ContentHandler in project catma by forTEXT.

the class ImportIntrinsicMarkupStep method enter.

@Override
public void enter(boolean back) {
    if (back) {
        return;
    }
    contentPanel.setEnabled(false);
    progressBar.setVisible(true);
    progressBar.setIndeterminate(true);
    @SuppressWarnings("unchecked") final ArrayList<UploadFile> files = new ArrayList<UploadFile>(((Collection<UploadFile>) wizardContext.get(DocumentWizard.WizardContextKey.UPLOAD_FILE_LIST)).stream().filter(uploadFile -> uploadFile.getMimetype().equals(FileType.XML2.getMimeType())).collect(Collectors.toList()));
    final TagManager tagmanager = new TagManager(new TagLibrary());
    BackgroundServiceProvider backgroundServiceProvider = (BackgroundServiceProvider) UI.getCurrent();
    backgroundServiceProvider.submit("inspecting-intrinsic-markup", new DefaultProgressCallable<List<UploadFile>>() {

        @Override
        public List<UploadFile> call() throws Exception {
            IDGenerator idGenerator = new IDGenerator();
            for (UploadFile uploadFile : files) {
                XML2ContentHandler contentHandler = new XML2ContentHandler();
                SourceDocument doc = new SourceDocument(uploadFile.getUuid(), contentHandler);
                SourceDocumentInfo documentInfo = new SourceDocumentInfo();
                TechInfoSet techInfoSet = new TechInfoSet();
                techInfoSet.setURI(uploadFile.getTempFilename());
                documentInfo.setTechInfoSet(techInfoSet);
                contentHandler.setSourceDocumentInfo(documentInfo);
                XmlMarkupCollectionSerializationHandler handler = new XmlMarkupCollectionSerializationHandler(tagmanager, contentHandler, project.getUser().getIdentifier());
                try (FileInputStream fis = new FileInputStream(new File(uploadFile.getTempFilename()))) {
                    AnnotationCollection collection = handler.deserialize(doc, idGenerator.generateCollectionId(), fis);
                    uploadFile.setIntrinsicMarkupCollection(collection);
                }
            }
            return files;
        }
    }, new ExecutionListener<List<UploadFile>>() {

        @Override
        public void done(List<UploadFile> result) {
            contentPanel.setEnabled(true);
            progressBar.setVisible(false);
            progressBar.setIndeterminate(false);
            fileList.clear();
            fileList.addAll(result);
            fileDataProvider.refreshAll();
            tagsetImportList.clear();
            String defaultIntrinsicXMLElmentsName = "Default Intrinsic XML Elements";
            for (TagsetDefinition tagset : tagmanager.getTagLibrary()) {
                if (!tagset.isEmpty()) {
                    TagsetDefinition targetTagset = project.getTagManager().getTagLibrary().getTagsetDefinition(tagset.getUuid());
                    boolean inProject = false;
                    if (targetTagset == null) {
                        targetTagset = tagset;
                    } else {
                        inProject = true;
                    }
                    String namespace = tagset.getName() == null ? "none" : tagset.getName();
                    if (tagset.getName() == null) {
                        tagset.setName(defaultIntrinsicXMLElmentsName);
                    }
                    TagsetImport tagsetImport = new TagsetImport(namespace, tagset, targetTagset, inProject ? TagsetImportState.WILL_BE_MERGED : TagsetImportState.WILL_BE_CREATED);
                    tagsetImportList.add(tagsetImport);
                }
            }
            tagsetDataProvider.refreshAll();
            wizardContext.put(DocumentWizard.WizardContextKey.TAGSET_IMPORT_LIST, tagsetImportList);
            if (stepChangeListener != null) {
                stepChangeListener.stepChanged(ImportIntrinsicMarkupStep.this);
            }
        }

        @Override
        public void error(Throwable t) {
            Logger.getLogger(ImportIntrinsicMarkupStep.class.getName()).log(Level.SEVERE, "Error inspecting files", t);
            String errorMsg = t.getMessage();
            if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                errorMsg = "";
            }
            Notification.show("Error", String.format("Error inspecting the contents! " + "\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
        }
    });
}

Also used : SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) ArrayList(java.util.ArrayList) BackgroundServiceProvider(de.catma.backgroundservice.BackgroundServiceProvider) ArrayList(java.util.ArrayList) List(java.util.List) TechInfoSet(de.catma.document.source.TechInfoSet) TagLibrary(de.catma.tag.TagLibrary) AnnotationCollection(de.catma.document.annotation.AnnotationCollection) SourceDocument(de.catma.document.source.SourceDocument) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) FileInputStream(java.io.FileInputStream) TagsetDefinition(de.catma.tag.TagsetDefinition) TagManager(de.catma.tag.TagManager) XmlMarkupCollectionSerializationHandler(de.catma.serialization.intrinsic.xml.XmlMarkupCollectionSerializationHandler) IDGenerator(de.catma.util.IDGenerator) File(java.io.File)

Example 3 with XML2ContentHandler

use of de.catma.document.source.contenthandler.XML2ContentHandler in project catma by forTEXT.

the class CorpusImporter method importCorpus.

/**
 * !BACKGROUND THREAD! No direct UI code here!
 *
 * @param progressListener
 * @param corpusFile
 * @param documentMetadataList
 * @param tempDir
 * @param ui
 * @param project
 * @return
 * @throws Exception
 */
public Void importCorpus(final ProgressListener progressListener, final File corpusFile, final List<CorpusImportDocumentMetadata> documentMetadataList, final String tempDir, final UI ui, final Project project) throws Exception {
    progressListener.setProgress("Importing Corpus");
    GZIPInputStream gzipIs = new GZIPInputStream(new FileInputStream(corpusFile));
    try (TarArchiveInputStream taIs = new TarArchiveInputStream(gzipIs)) {
        TarArchiveEntry entry = taIs.getNextTarEntry();
        while (entry != null) {
            final String entryName = entry.getName();
            final String[] pathParts = entry.getName().split(Pattern.quote("/"));
            final String documentIdPart = pathParts[2];
            final String documentId = documentIdPart.substring(documentIdPart.indexOf("__") + 3);
            final String idUri = "catma://" + documentId;
            if (pathParts[3].equals("annotationcollections")) {
                progressListener.setProgress("Importing Collection %1$s", pathParts[4]);
                ui.accessSynchronously(() -> {
                    try {
                        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                        IOUtils.copy(taIs, buffer);
                        SourceDocument document = project.getSourceDocument(documentId);
                        Pair<AnnotationCollection, List<TagsetDefinitionImportStatus>> loadResult = project.loadAnnotationCollection(new ByteArrayInputStream(buffer.toByteArray()), document);
                        List<TagsetDefinitionImportStatus> tagsetDefinitionImportStatusList = loadResult.getSecond();
                        final AnnotationCollection annotationCollection = loadResult.getFirst();
                        Optional<TagsetDefinition> optIntrinsicTagset = annotationCollection.getTagLibrary().getTagsetDefinitions().stream().filter(tagsetDef -> tagsetDef.getName().equals("Intrinsic Markup")).findFirst();
                        if (optIntrinsicTagset.isPresent()) {
                            TagsetDefinition intrinsicTagset = optIntrinsicTagset.get();
                            List<TagReference> intrinsicAnnotations = annotationCollection.getTagReferences(intrinsicTagset);
                            if (!intrinsicAnnotations.isEmpty()) {
                                annotationCollection.removeTagReferences(intrinsicAnnotations);
                            }
                            annotationCollection.getTagLibrary().remove(intrinsicTagset);
                            tagsetDefinitionImportStatusList.stream().filter(status -> status.getTagset().equals(intrinsicTagset)).findFirst().ifPresent(status -> status.setDoImport(false));
                        }
                        tagsetDefinitionImportStatusList.stream().filter(status -> status.getTagset().isEmpty()).forEach(status -> status.setDoImport(false));
                        if (!annotationCollection.isEmpty()) {
                            project.importCollection(tagsetDefinitionImportStatusList, annotationCollection);
                        }
                    } catch (Exception e) {
                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }
                        Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Collection will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
                    }
                });
            } else {
                final CorpusImportDocumentMetadata documentMetadata = documentMetadataList.stream().filter(metadata -> metadata.getSourceDocID().equals(idUri)).findFirst().orElse(null);
                final Locale locale = LocaleUtils.toLocale(documentMetadata.getSourceDocLocale());
                final boolean useApostrophe = Arrays.asList(documentMetadata.getSourceDocSepChars()).contains(String.valueOf(UploadFile.APOSTROPHE));
                final String title = (documentMetadata.getSourceDocName() == null || documentMetadata.getSourceDocName().isEmpty()) ? documentId : documentMetadata.getSourceDocName();
                progressListener.setProgress("Importing Document %1$s", title);
                final File tempFile = new File(new File(tempDir), documentId);
                if (tempFile.exists()) {
                    tempFile.delete();
                }
                try (FileOutputStream fos = new FileOutputStream(tempFile)) {
                    IOUtils.copy(taIs, fos);
                }
                ui.accessSynchronously(() -> {
                    IDGenerator idGenerator = new IDGenerator();
                    IndexInfoSet indexInfoSet = new IndexInfoSet(Collections.emptyList(), useApostrophe ? Lists.newArrayList(UploadFile.APOSTROPHE) : Collections.emptyList(), locale);
                    TechInfoSet techInfoSet = new TechInfoSet(documentId, FileType.TEXT.getMimeType(), tempFile.toURI());
                    ContentInfoSet contentInfoSet = new ContentInfoSet(documentMetadata.getSourceDocAuthor(), documentMetadata.getSourceDocDescription(), documentMetadata.getSourceDocPublisher(), title);
                    techInfoSet.setCharset(Charset.forName("UTF-8"));
                    SourceDocumentInfo documentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);
                    AbstractSourceContentHandler handler = null;
                    boolean loadIntrinsicMarkup = false;
                    if (entryName.endsWith("xml2")) {
                        handler = new XML2ContentHandler();
                        loadIntrinsicMarkup = true;
                    } else if (entryName.endsWith("xml")) {
                        handler = new OldXMLContentHandler();
                        loadIntrinsicMarkup = true;
                    } else {
                        handler = new StandardContentHandler();
                    }
                    handler.setSourceDocumentInfo(documentInfo);
                    SourceDocument document = new SourceDocument(documentId, handler);
                    try {
                        project.insert(document, false);
                        if (loadIntrinsicMarkup) {
                            final TagManager tagmanager = new TagManager(new TagLibrary());
                            XmlMarkupCollectionSerializationHandler markupHandler = new XmlMarkupCollectionSerializationHandler(tagmanager, (XML2ContentHandler) handler, project.getUser().getIdentifier());
                            try (FileInputStream fis = new FileInputStream(tempFile)) {
                                AnnotationCollection intrinsicMarkupCollection = markupHandler.deserialize(document, idGenerator.generateCollectionId(), fis);
                                Collection<TagsetImport> tagsetImports = new ArrayList<TagsetImport>();
                                String defaultIntrinsicXMLElmentsName = "Default Intrinsic XML Elements";
                                for (TagsetDefinition tagset : tagmanager.getTagLibrary()) {
                                    if (!tagset.isEmpty()) {
                                        TagsetDefinition targetTagset = project.getTagManager().getTagLibrary().getTagsetDefinition(tagset.getUuid());
                                        boolean inProject = false;
                                        if (targetTagset == null) {
                                            targetTagset = tagset;
                                        } else {
                                            inProject = true;
                                        }
                                        String namespace = tagset.getName() == null ? "none" : tagset.getName();
                                        if (tagset.getName() == null) {
                                            tagset.setName(defaultIntrinsicXMLElmentsName);
                                        }
                                        TagsetImport tagsetImport = new TagsetImport(namespace, tagset, targetTagset, inProject ? TagsetImportState.WILL_BE_MERGED : TagsetImportState.WILL_BE_CREATED);
                                        tagsetImports.add(tagsetImport);
                                    }
                                }
                                // Creating Tagsets
                                tagsetImports.stream().filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_CREATED)).forEach(tagsetImport -> {
                                    if (project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid()) != null) {
                                        // already imported, so it will be a merge
                                        tagsetImport.setImportState(TagsetImportState.WILL_BE_MERGED);
                                    } else {
                                        TagsetDefinition extractedTagset = tagsetImport.getExtractedTagset();
                                        try {
                                            project.importTagsets(Collections.singletonList(new TagsetDefinitionImportStatus(extractedTagset, project.inProjectHistory(extractedTagset.getUuid()), project.getTagManager().getTagLibrary().getTagsetDefinition(extractedTagset.getUuid()) != null)));
                                        } catch (Exception e) {
                                            Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, String.format("Error importing tagset %1$s with ID %2$s", extractedTagset.getName(), extractedTagset.getUuid()), e);
                                            String errorMsg = e.getMessage();
                                            if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                                                errorMsg = "";
                                            }
                                            Notification.show("Error", String.format("Error importing tagset %1$s! " + "This tagset will be skipped!\n The underlying error message was:\n%2$s", extractedTagset.getName(), errorMsg), Type.ERROR_MESSAGE);
                                        }
                                    }
                                });
                                // Merging Tagsets
                                tagsetImports.stream().filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_MERGED)).forEach(tagsetImport -> {
                                    TagsetDefinition targetTagset = project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid());
                                    for (TagDefinition tag : tagsetImport.getExtractedTagset()) {
                                        Optional<TagDefinition> optionalTag = targetTagset.getTagDefinitionsByName(tag.getName()).findFirst();
                                        if (optionalTag.isPresent()) {
                                            TagDefinition existingTag = optionalTag.get();
                                            tag.getUserDefinedPropertyDefinitions().forEach(pd -> {
                                                if (existingTag.getPropertyDefinition(pd.getName()) == null) {
                                                    project.getTagManager().addUserDefinedPropertyDefinition(existingTag, new PropertyDefinition(pd));
                                                }
                                            });
                                            List<TagReference> tagReferences = intrinsicMarkupCollection.getTagReferences(tag);
                                            intrinsicMarkupCollection.removeTagReferences(tagReferences);
                                            Multimap<TagInstance, TagReference> referencesByInstance = ArrayListMultimap.create();
                                            tagReferences.forEach(tr -> referencesByInstance.put(tr.getTagInstance(), tr));
                                            for (TagInstance incomingTagInstance : referencesByInstance.keySet()) {
                                                TagInstance newTagInstance = new TagInstance(idGenerator.generate(), existingTag.getUuid(), incomingTagInstance.getAuthor(), incomingTagInstance.getTimestamp(), existingTag.getUserDefinedPropertyDefinitions(), targetTagset.getUuid());
                                                for (Property oldProp : incomingTagInstance.getUserDefinedProperties()) {
                                                    String oldPropDefId = oldProp.getPropertyDefinitionId();
                                                    PropertyDefinition oldPropDef = tag.getPropertyDefinitionByUuid(oldPropDefId);
                                                    PropertyDefinition existingPropDef = existingTag.getPropertyDefinition(oldPropDef.getName());
                                                    newTagInstance.addUserDefinedProperty(new Property(existingPropDef.getUuid(), oldProp.getPropertyValueList()));
                                                }
                                                ArrayList<TagReference> newReferences = new ArrayList<>();
                                                referencesByInstance.get(incomingTagInstance).forEach(tr -> {
                                                    try {
                                                        newReferences.add(new TagReference(newTagInstance, tr.getTarget().toString(), tr.getRange(), tr.getUserMarkupCollectionUuid()));
                                                    } catch (URISyntaxException e) {
                                                        e.printStackTrace();
                                                    }
                                                });
                                                intrinsicMarkupCollection.addTagReferences(newReferences);
                                            }
                                        } else {
                                            tag.setTagsetDefinitionUuid(targetTagset.getUuid());
                                            project.getTagManager().addTagDefinition(targetTagset, tag);
                                        }
                                    }
                                });
                                project.importCollection(Collections.emptyList(), intrinsicMarkupCollection);
                            }
                            if (tempFile.exists()) {
                                tempFile.delete();
                            }
                        }
                    } catch (Exception e) {
                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }
                        Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Document will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
                    }
                });
            }
            entry = taIs.getNextTarEntry();
        }
    }
    return null;
}

Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) AbstractSourceContentHandler(de.catma.document.source.contenthandler.AbstractSourceContentHandler) Arrays(java.util.Arrays) GZIPInputStream(java.util.zip.GZIPInputStream) URISyntaxException(java.net.URISyntaxException) UI(com.vaadin.ui.UI) XmlMarkupCollectionSerializationHandler(de.catma.serialization.intrinsic.xml.XmlMarkupCollectionSerializationHandler) TechInfoSet(de.catma.document.source.TechInfoSet) OldXMLContentHandler(de.catma.document.source.contenthandler.OldXMLContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Locale(java.util.Locale) TagsetImport(de.catma.ui.module.project.documentwizard.TagsetImport) Collection(java.util.Collection) IndexInfoSet(de.catma.document.source.IndexInfoSet) ProjectView(de.catma.ui.module.project.ProjectView) TagInstance(de.catma.tag.TagInstance) Logger(java.util.logging.Logger) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) TagReference(de.catma.document.annotation.TagReference) List(java.util.List) Type(com.vaadin.ui.Notification.Type) TagDefinition(de.catma.tag.TagDefinition) Optional(java.util.Optional) FileType(de.catma.document.source.FileType) Pattern(java.util.regex.Pattern) ContentInfoSet(de.catma.document.source.ContentInfoSet) PropertyDefinition(de.catma.tag.PropertyDefinition) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TagManager(de.catma.tag.TagManager) UploadFile(de.catma.ui.module.project.documentwizard.UploadFile) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) Multimap(com.google.common.collect.Multimap) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) LocaleUtils(org.apache.commons.lang3.LocaleUtils) Lists(com.google.common.collect.Lists) Charset(java.nio.charset.Charset) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) Notification(com.vaadin.ui.Notification) StandardContentHandler(de.catma.document.source.contenthandler.StandardContentHandler) TagsetDefinition(de.catma.tag.TagsetDefinition) Pair(de.catma.util.Pair) IDGenerator(de.catma.util.IDGenerator) TagLibrary(de.catma.tag.TagLibrary) ProgressListener(de.catma.backgroundservice.ProgressListener) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) Property(de.catma.tag.Property) Project(de.catma.project.Project) FileOutputStream(java.io.FileOutputStream) IOUtils(org.apache.commons.compress.utils.IOUtils) FileInputStream(java.io.FileInputStream) SourceDocument(de.catma.document.source.SourceDocument) AnnotationCollection(de.catma.document.annotation.AnnotationCollection) File(java.io.File) TagsetDefinitionImportStatus(de.catma.serialization.TagsetDefinitionImportStatus) TagsetImportState(de.catma.ui.module.project.documentwizard.TagsetImportState) Collections(java.util.Collections) Locale(java.util.Locale) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) ArrayList(java.util.ArrayList) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) IndexInfoSet(de.catma.document.source.IndexInfoSet) List(java.util.List) ArrayList(java.util.ArrayList) TechInfoSet(de.catma.document.source.TechInfoSet) TagLibrary(de.catma.tag.TagLibrary) AnnotationCollection(de.catma.document.annotation.AnnotationCollection) SourceDocument(de.catma.document.source.SourceDocument) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) FileInputStream(java.io.FileInputStream) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) TagsetDefinition(de.catma.tag.TagsetDefinition) OldXMLContentHandler(de.catma.document.source.contenthandler.OldXMLContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) FileOutputStream(java.io.FileOutputStream) TagReference(de.catma.document.annotation.TagReference) StandardContentHandler(de.catma.document.source.contenthandler.StandardContentHandler) UploadFile(de.catma.ui.module.project.documentwizard.UploadFile) File(java.io.File) IDGenerator(de.catma.util.IDGenerator) TagsetImport(de.catma.ui.module.project.documentwizard.TagsetImport) TagDefinition(de.catma.tag.TagDefinition) URISyntaxException(java.net.URISyntaxException) GZIPInputStream(java.util.zip.GZIPInputStream) ContentInfoSet(de.catma.document.source.ContentInfoSet) ProjectView(de.catma.ui.module.project.ProjectView) Property(de.catma.tag.Property) AbstractSourceContentHandler(de.catma.document.source.contenthandler.AbstractSourceContentHandler) ByteArrayOutputStream(java.io.ByteArrayOutputStream) PropertyDefinition(de.catma.tag.PropertyDefinition) URISyntaxException(java.net.URISyntaxException) TagManager(de.catma.tag.TagManager) XmlMarkupCollectionSerializationHandler(de.catma.serialization.intrinsic.xml.XmlMarkupCollectionSerializationHandler) TagInstance(de.catma.tag.TagInstance) TagsetDefinitionImportStatus(de.catma.serialization.TagsetDefinitionImportStatus)

Example 4 with XML2ContentHandler

use of de.catma.document.source.contenthandler.XML2ContentHandler in project catma by forTEXT.

the class InspectContentStep method enter.

@Override
public void enter(boolean back) {
    if (back) {
        return;
    }
    @SuppressWarnings("unchecked") Collection<UploadFile> fileList = (Collection<UploadFile>) wizardContext.get(DocumentWizard.WizardContextKey.UPLOAD_FILE_LIST);
    contentPanel.setEnabled(false);
    progressBar.setVisible(true);
    progressBar.setIndeterminate(true);
    final ArrayList<UploadFile> files = new ArrayList<UploadFile>(fileList);
    BackgroundServiceProvider backgroundServiceProvider = (BackgroundServiceProvider) UI.getCurrent();
    backgroundServiceProvider.submit("inspecting-files", new DefaultProgressCallable<List<UploadFile>>() {

        @Override
        public List<UploadFile> call() throws Exception {
            Tika tika = new Tika();
            LanguageDetector languageDetector = LanguageDetector.getDefaultLanguageDetector();
            try {
                languageDetector.loadModels();
            } catch (IOException e) {
                ((ErrorHandler) UI.getCurrent()).showAndLogError("Error loading language detection models!", e);
            }
            for (UploadFile uploadFile : files) {
                if (uploadFile.getMimetype().equals(FileType.XML2.getMimeType())) {
                    XML2ContentHandler contentHandler = new XML2ContentHandler();
                    SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo();
                    TechInfoSet techInfoSet = new TechInfoSet(uploadFile.getOriginalFilename(), uploadFile.getMimetype(), uploadFile.getTempFilename());
                    sourceDocumentInfo.setTechInfoSet(techInfoSet);
                    contentHandler.setSourceDocumentInfo(sourceDocumentInfo);
                    contentHandler.load();
                    String content = contentHandler.getContent();
                    LanguageResult languageResult = languageDetector.detect(content);
                    if (languageResult.isReasonablyCertain() && languageResult.getLanguage() != null) {
                        uploadFile.setLanguage(new LanguageItem(new Locale(languageResult.getLanguage())));
                    }
                } else {
                    Metadata metadata = new Metadata();
                    try {
                        try (FileInputStream fis = new FileInputStream(new File(uploadFile.getTempFilename()))) {
                            String content = tika.parseToString(fis, metadata);
                            String contentType = metadata.get(Metadata.CONTENT_TYPE);
                            MediaType mediaType = MediaType.parse(contentType);
                            String charset = mediaType.getParameters().get("charset");
                            if (charset != null) {
                                uploadFile.setCharset(Charset.forName(charset));
                            }
                            LanguageResult languageResult = languageDetector.detect(content);
                            if (languageResult.isReasonablyCertain() && languageResult.getLanguage() != null) {
                                uploadFile.setLanguage(new LanguageItem(new Locale(languageResult.getLanguage())));
                            }
                        }
                    } catch (Exception e) {
                        Logger.getLogger(InspectContentStep.class.getName()).log(Level.SEVERE, String.format("Error inspecting %1$s", uploadFile.getOriginalFilename()), e);
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }
                        Notification.show("Error", String.format("Error inspecting content of %1$s! " + "Adding this file to your Project might fail!\n The underlying error message was:\n%2$s", uploadFile.getOriginalFilename(), errorMsg), Type.ERROR_MESSAGE);
                    }
                }
            }
            return files;
        }
    }, new ExecutionListener<List<UploadFile>>() {

        @Override
        public void done(List<UploadFile> result) {
            contentPanel.setEnabled(true);
            progressBar.setVisible(false);
            progressBar.setIndeterminate(false);
            fileList.clear();
            fileList.addAll(result);
            fileDataProvider.refreshAll();
            if (!fileList.isEmpty()) {
                fileList.stream().findFirst().ifPresent(uploadFile -> {
                    fileGrid.select(uploadFile);
                    updatePreview(uploadFile);
                });
            }
            if (stepChangeListener != null) {
                stepChangeListener.stepChanged(InspectContentStep.this);
            }
        }

        @Override
        public void error(Throwable t) {
            Logger.getLogger(InspectContentStep.class.getName()).log(Level.SEVERE, "Error inspecting files", t);
            String errorMsg = t.getMessage();
            if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                errorMsg = "";
            }
            Notification.show("Error", String.format("Error inspecting the contents! " + "\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
        }
    });
}

Also used : Locale(java.util.Locale) BackgroundServiceProvider(de.catma.backgroundservice.BackgroundServiceProvider) DefaultProgressCallable(de.catma.backgroundservice.DefaultProgressCallable) StepChangeListener(de.catma.ui.dialog.wizard.StepChangeListener) VerticalLayout(com.vaadin.ui.VerticalLayout) LanguageItem(de.catma.document.source.LanguageItem) ComboBox(com.vaadin.ui.ComboBox) UI(com.vaadin.ui.UI) LanguageDetector(org.apache.tika.language.detect.LanguageDetector) WizardContext(de.catma.ui.dialog.wizard.WizardContext) MediaType(org.apache.tika.mime.MediaType) ActionGridComponent(de.catma.ui.component.actiongrid.ActionGridComponent) ProgressStep(de.catma.ui.dialog.wizard.ProgressStep) TechInfoSet(de.catma.document.source.TechInfoSet) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) Binding(com.vaadin.data.Binder.Binding) Metadata(org.apache.tika.metadata.Metadata) Charset(java.nio.charset.Charset) CheckBox(com.vaadin.ui.CheckBox) Notification(com.vaadin.ui.Notification) ErrorHandler(de.catma.ui.module.main.ErrorHandler) Locale(java.util.Locale) Label(com.vaadin.ui.Label) ProgressBar(com.vaadin.ui.ProgressBar) TextArea(com.vaadin.ui.TextArea) ListDataProvider(com.vaadin.data.provider.ListDataProvider) ContentMode(com.vaadin.shared.ui.ContentMode) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) ExecutionListener(de.catma.backgroundservice.ExecutionListener) Collection(java.util.Collection) IndexInfoSet(de.catma.document.source.IndexInfoSet) ProgressStepFactory(de.catma.ui.dialog.wizard.ProgressStepFactory) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) Logger(java.util.logging.Logger) File(java.io.File) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) List(java.util.List) Type(com.vaadin.ui.Notification.Type) HorizontalLayout(com.vaadin.ui.HorizontalLayout) FileType(de.catma.document.source.FileType) Tika(org.apache.tika.Tika) WizardStep(de.catma.ui.dialog.wizard.WizardStep) Collections(java.util.Collections) LanguageResult(org.apache.tika.language.detect.LanguageResult) Grid(com.vaadin.ui.Grid) SingleOptionInputDialog(de.catma.ui.dialog.SingleOptionInputDialog) LanguageResult(org.apache.tika.language.detect.LanguageResult) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) ArrayList(java.util.ArrayList) BackgroundServiceProvider(de.catma.backgroundservice.BackgroundServiceProvider) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) MediaType(org.apache.tika.mime.MediaType) ArrayList(java.util.ArrayList) List(java.util.List) TechInfoSet(de.catma.document.source.TechInfoSet) ErrorHandler(de.catma.ui.module.main.ErrorHandler) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) IOException(java.io.IOException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) LanguageDetector(org.apache.tika.language.detect.LanguageDetector) Collection(java.util.Collection) LanguageItem(de.catma.document.source.LanguageItem) File(java.io.File)

Example 5 with XML2ContentHandler

use of de.catma.document.source.contenthandler.XML2ContentHandler in project catma by forTEXT.

the class InspectContentStep method updatePreview.

private void updatePreview(UploadFile uploadFile) {
    Tika tika = new Tika();
    Metadata metadata = new Metadata();
    MediaType type = MediaType.parse(uploadFile.getMimetype());
    if (type.getBaseType().toString().equals(FileType.TEXT.getMimeType())) {
        metadata.set(Metadata.CONTENT_TYPE, new MediaType(type, uploadFile.getCharset()).toString());
    }
    try {
        String content = "";
        SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo();
        IndexInfoSet indexInfoSet = new IndexInfoSet(Collections.emptyList(), Collections.emptyList(), uploadFile.getLocale());
        if (uploadFile.getMimetype().equals(FileType.XML2.getMimeType())) {
            XML2ContentHandler contentHandler = new XML2ContentHandler();
            TechInfoSet techInfoSet = new TechInfoSet(uploadFile.getOriginalFilename(), uploadFile.getMimetype(), uploadFile.getTempFilename());
            sourceDocumentInfo.setTechInfoSet(techInfoSet);
            contentHandler.setSourceDocumentInfo(sourceDocumentInfo);
            contentHandler.load();
            content = contentHandler.getContent();
        } else {
            try (FileInputStream fis = new FileInputStream(new File(uploadFile.getTempFilename()))) {
                content = tika.parseToString(fis, metadata, 3000);
            }
        }
        if (!content.isEmpty()) {
            content += " [...] ";
        }
        taPreview.setValue(content);
        if (indexInfoSet.isRightToLeftWriting()) {
            taPreview.addStyleName("document-wizard-rtl-preview");
        } else {
            taPreview.removeStyleName("document-wizard-rtl-preview");
        }
    } catch (Exception e) {
        Logger.getLogger(InspectContentStep.class.getName()).log(Level.SEVERE, String.format("Error loading preview of %1$s", uploadFile.getOriginalFilename()), e);
        String errorMsg = e.getMessage();
        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
            errorMsg = "";
        }
        Notification.show("Error", String.format("Error loading content of %1$s! " + "Adding this file to your Project might fail!\n The underlying error message was:\n%2$s", uploadFile.getOriginalFilename(), errorMsg), Type.ERROR_MESSAGE);
    }
}

Also used : SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) IndexInfoSet(de.catma.document.source.IndexInfoSet) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) Tika(org.apache.tika.Tika) TechInfoSet(de.catma.document.source.TechInfoSet) File(java.io.File) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException)

Aggregations

SourceDocumentInfo (de.catma.document.source.SourceDocumentInfo)5 XML2ContentHandler (de.catma.document.source.contenthandler.XML2ContentHandler)5 TechInfoSet (de.catma.document.source.TechInfoSet)4 File (java.io.File)4 FileInputStream (java.io.FileInputStream)4 IndexInfoSet (de.catma.document.source.IndexInfoSet)3 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 List (java.util.List)3 Notification (com.vaadin.ui.Notification)2 Type (com.vaadin.ui.Notification.Type)2 UI (com.vaadin.ui.UI)2 BackgroundServiceProvider (de.catma.backgroundservice.BackgroundServiceProvider)2 AnnotationCollection (de.catma.document.annotation.AnnotationCollection)2 FileType (de.catma.document.source.FileType)2 SourceDocument (de.catma.document.source.SourceDocument)2 Charset (java.nio.charset.Charset)2 Collection (java.util.Collection)2 Collections (java.util.Collections)2 Locale (java.util.Locale)2