Search in sources :

Example 1 with OldXMLContentHandler

Use of de.catma.document.source.contenthandler.OldXMLContentHandler in project catma by forTEXT.

The method importCorpus of the class CorpusImporter.

/**
 * Imports a gzip-compressed tar corpus archive into the given project.
 * <p>
 * !BACKGROUND THREAD! No direct UI code here! All project and UI interaction
 * is wrapped in {@link UI#accessSynchronously(Runnable)}.
 * <p>
 * Every archive entry name is split on "/". The third path part yields the
 * document ID; if the fourth path part equals "annotationcollections" the entry
 * is loaded as an annotation collection of that document, otherwise the entry
 * is treated as the document itself: it is copied to a file in {@code tempDir},
 * inserted into the project and, for entries ending in "xml" or "xml2", its
 * intrinsic markup is imported as an additional collection.
 * <p>
 * Errors raised inside the UI-synchronized import steps are logged and shown
 * as a notification; the affected entry is skipped and the import continues.
 *
 * @param progressListener receives progress messages for each imported entry
 * @param corpusFile the gzip-compressed tar archive to read
 * @param documentMetadataList metadata of the contained documents, matched by
 *        source document ID ("catma://" + document ID)
 * @param tempDir path of the directory that receives the temporary document files
 * @param ui the UI used to run the project modifications synchronously
 * @param project the target project
 * @return always {@code null}
 * @throws Exception if the archive cannot be read, or an
 *         {@link IllegalStateException} if a document entry has no matching
 *         element in {@code documentMetadataList}
 */
public Void importCorpus(final ProgressListener progressListener, final File corpusFile, final List<CorpusImportDocumentMetadata> documentMetadataList, final String tempDir, final UI ui, final Project project) throws Exception {
    progressListener.setProgress("Importing Corpus");
    // All three streams are managed resources so that the file handle is released
    // even if one of the wrapping stream constructors fails (e.g. not a gzip file).
    try (FileInputStream fileIs = new FileInputStream(corpusFile);
            GZIPInputStream gzipIs = new GZIPInputStream(fileIs);
            TarArchiveInputStream taIs = new TarArchiveInputStream(gzipIs)) {
        TarArchiveEntry entry = taIs.getNextTarEntry();
        while (entry != null) {
            final String entryName = entry.getName();
            final String[] pathParts = entry.getName().split(Pattern.quote("/"));
            final String documentIdPart = pathParts[2];
            // NOTE(review): skips the "__" marker plus one further character;
            // presumably dictated by the export's naming scheme - confirm.
            final String documentId = documentIdPart.substring(documentIdPart.indexOf("__") + 3);
            final String idUri = "catma://" + documentId;
            if (pathParts[3].equals("annotationcollections")) {
                progressListener.setProgress("Importing Collection %1$s", pathParts[4]);
                ui.accessSynchronously(() -> {
                    try {
                        // buffer the current entry, the tar stream itself must stay open
                        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                        IOUtils.copy(taIs, buffer);
                        SourceDocument document = project.getSourceDocument(documentId);
                        Pair<AnnotationCollection, List<TagsetDefinitionImportStatus>> loadResult = project.loadAnnotationCollection(new ByteArrayInputStream(buffer.toByteArray()), document);
                        List<TagsetDefinitionImportStatus> tagsetDefinitionImportStatusList = loadResult.getSecond();
                        final AnnotationCollection annotationCollection = loadResult.getFirst();
                        // the "Intrinsic Markup" tagset and its annotations are dropped from the collection
                        Optional<TagsetDefinition> optIntrinsicTagset = annotationCollection.getTagLibrary().getTagsetDefinitions().stream().filter(tagsetDef -> tagsetDef.getName().equals("Intrinsic Markup")).findFirst();
                        if (optIntrinsicTagset.isPresent()) {
                            TagsetDefinition intrinsicTagset = optIntrinsicTagset.get();
                            List<TagReference> intrinsicAnnotations = annotationCollection.getTagReferences(intrinsicTagset);
                            if (!intrinsicAnnotations.isEmpty()) {
                                annotationCollection.removeTagReferences(intrinsicAnnotations);
                            }
                            annotationCollection.getTagLibrary().remove(intrinsicTagset);
                            tagsetDefinitionImportStatusList.stream().filter(status -> status.getTagset().equals(intrinsicTagset)).findFirst().ifPresent(status -> status.setDoImport(false));
                        }
                        // empty tagsets are not imported
                        tagsetDefinitionImportStatusList.stream().filter(status -> status.getTagset().isEmpty()).forEach(status -> status.setDoImport(false));
                        if (!annotationCollection.isEmpty()) {
                            project.importCollection(tagsetDefinitionImportStatusList, annotationCollection);
                        }
                    } catch (Exception e) {
                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }
                        Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Collection will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
                    }
                });
            } else {
                final CorpusImportDocumentMetadata documentMetadata = documentMetadataList.stream().filter(metadata -> metadata.getSourceDocID().equals(idUri)).findFirst().orElse(null);
                if (documentMetadata == null) {
                    // fail with a descriptive message instead of a bare NullPointerException below
                    throw new IllegalStateException(String.format("No metadata found for document %1$s (archive entry %2$s)", idUri, entryName));
                }
                final Locale locale = LocaleUtils.toLocale(documentMetadata.getSourceDocLocale());
                final boolean useApostrophe = Arrays.asList(documentMetadata.getSourceDocSepChars()).contains(String.valueOf(UploadFile.APOSTROPHE));
                // fall back to the document ID if the metadata carries no name
                final String title = (documentMetadata.getSourceDocName() == null || documentMetadata.getSourceDocName().isEmpty()) ? documentId : documentMetadata.getSourceDocName();
                progressListener.setProgress("Importing Document %1$s", title);
                // copy the current entry to a temporary file named after the document ID
                final File tempFile = new File(new File(tempDir), documentId);
                if (tempFile.exists()) {
                    tempFile.delete();
                }
                try (FileOutputStream fos = new FileOutputStream(tempFile)) {
                    IOUtils.copy(taIs, fos);
                }
                ui.accessSynchronously(() -> {
                    IDGenerator idGenerator = new IDGenerator();
                    IndexInfoSet indexInfoSet = new IndexInfoSet(Collections.emptyList(), useApostrophe ? Lists.newArrayList(UploadFile.APOSTROPHE) : Collections.emptyList(), locale);
                    TechInfoSet techInfoSet = new TechInfoSet(documentId, FileType.TEXT.getMimeType(), tempFile.toURI());
                    ContentInfoSet contentInfoSet = new ContentInfoSet(documentMetadata.getSourceDocAuthor(), documentMetadata.getSourceDocDescription(), documentMetadata.getSourceDocPublisher(), title);
                    techInfoSet.setCharset(Charset.forName("UTF-8"));
                    SourceDocumentInfo documentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);
                    // the content handler is selected by the suffix of the archive entry name
                    AbstractSourceContentHandler handler = null;
                    boolean loadIntrinsicMarkup = false;
                    if (entryName.endsWith("xml2")) {
                        handler = new XML2ContentHandler();
                        loadIntrinsicMarkup = true;
                    } else if (entryName.endsWith("xml")) {
                        handler = new OldXMLContentHandler();
                        loadIntrinsicMarkup = true;
                    } else {
                        handler = new StandardContentHandler();
                    }
                    handler.setSourceDocumentInfo(documentInfo);
                    SourceDocument document = new SourceDocument(documentId, handler);
                    try {
                        project.insert(document, false);
                        if (loadIntrinsicMarkup) {
                            final TagManager tagmanager = new TagManager(new TagLibrary());
                            // NOTE(review): the cast assumes OldXMLContentHandler is a subtype of XML2ContentHandler - confirm.
                            XmlMarkupCollectionSerializationHandler markupHandler = new XmlMarkupCollectionSerializationHandler(tagmanager, (XML2ContentHandler) handler, project.getUser().getIdentifier());
                            try (FileInputStream fis = new FileInputStream(tempFile)) {
                                AnnotationCollection intrinsicMarkupCollection = markupHandler.deserialize(document, idGenerator.generateCollectionId(), fis);
                                Collection<TagsetImport> tagsetImports = new ArrayList<TagsetImport>();
                                String defaultIntrinsicXMLElmentsName = "Default Intrinsic XML Elements";
                                // classify each non-empty extracted tagset: merge if the project already has it, create otherwise
                                for (TagsetDefinition tagset : tagmanager.getTagLibrary()) {
                                    if (!tagset.isEmpty()) {
                                        TagsetDefinition targetTagset = project.getTagManager().getTagLibrary().getTagsetDefinition(tagset.getUuid());
                                        boolean inProject = false;
                                        if (targetTagset == null) {
                                            targetTagset = tagset;
                                        } else {
                                            inProject = true;
                                        }
                                        String namespace = tagset.getName() == null ? "none" : tagset.getName();
                                        if (tagset.getName() == null) {
                                            tagset.setName(defaultIntrinsicXMLElmentsName);
                                        }
                                        TagsetImport tagsetImport = new TagsetImport(namespace, tagset, targetTagset, inProject ? TagsetImportState.WILL_BE_MERGED : TagsetImportState.WILL_BE_CREATED);
                                        tagsetImports.add(tagsetImport);
                                    }
                                }
                                // Creating Tagsets
                                tagsetImports.stream().filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_CREATED)).forEach(tagsetImport -> {
                                    if (project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid()) != null) {
                                        // already imported, so it will be a merge
                                        tagsetImport.setImportState(TagsetImportState.WILL_BE_MERGED);
                                    } else {
                                        TagsetDefinition extractedTagset = tagsetImport.getExtractedTagset();
                                        try {
                                            project.importTagsets(Collections.singletonList(new TagsetDefinitionImportStatus(extractedTagset, project.inProjectHistory(extractedTagset.getUuid()), project.getTagManager().getTagLibrary().getTagsetDefinition(extractedTagset.getUuid()) != null)));
                                        } catch (Exception e) {
                                            Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, String.format("Error importing tagset %1$s with ID %2$s", extractedTagset.getName(), extractedTagset.getUuid()), e);
                                            String errorMsg = e.getMessage();
                                            if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                                                errorMsg = "";
                                            }
                                            Notification.show("Error", String.format("Error importing tagset %1$s! " + "This tagset will be skipped!\n The underlying error message was:\n%2$s", extractedTagset.getName(), errorMsg), Type.ERROR_MESSAGE);
                                        }
                                    }
                                });
                                // Merging Tagsets
                                tagsetImports.stream().filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_MERGED)).forEach(tagsetImport -> {
                                    TagsetDefinition targetTagset = project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid());
                                    for (TagDefinition tag : tagsetImport.getExtractedTagset()) {
                                        // tags are matched by name within the target tagset
                                        Optional<TagDefinition> optionalTag = targetTagset.getTagDefinitionsByName(tag.getName()).findFirst();
                                        if (optionalTag.isPresent()) {
                                            TagDefinition existingTag = optionalTag.get();
                                            // add property definitions the existing tag does not have yet
                                            tag.getUserDefinedPropertyDefinitions().forEach(pd -> {
                                                if (existingTag.getPropertyDefinition(pd.getName()) == null) {
                                                    project.getTagManager().addUserDefinedPropertyDefinition(existingTag, new PropertyDefinition(pd));
                                                }
                                            });
                                            // re-create the incoming annotations against the existing tag
                                            List<TagReference> tagReferences = intrinsicMarkupCollection.getTagReferences(tag);
                                            intrinsicMarkupCollection.removeTagReferences(tagReferences);
                                            Multimap<TagInstance, TagReference> referencesByInstance = ArrayListMultimap.create();
                                            tagReferences.forEach(tr -> referencesByInstance.put(tr.getTagInstance(), tr));
                                            for (TagInstance incomingTagInstance : referencesByInstance.keySet()) {
                                                TagInstance newTagInstance = new TagInstance(idGenerator.generate(), existingTag.getUuid(), incomingTagInstance.getAuthor(), incomingTagInstance.getTimestamp(), existingTag.getUserDefinedPropertyDefinitions(), targetTagset.getUuid());
                                                // map property values from the incoming definition to the same-named existing definition
                                                for (Property oldProp : incomingTagInstance.getUserDefinedProperties()) {
                                                    String oldPropDefId = oldProp.getPropertyDefinitionId();
                                                    PropertyDefinition oldPropDef = tag.getPropertyDefinitionByUuid(oldPropDefId);
                                                    PropertyDefinition existingPropDef = existingTag.getPropertyDefinition(oldPropDef.getName());
                                                    newTagInstance.addUserDefinedProperty(new Property(existingPropDef.getUuid(), oldProp.getPropertyValueList()));
                                                }
                                                ArrayList<TagReference> newReferences = new ArrayList<>();
                                                referencesByInstance.get(incomingTagInstance).forEach(tr -> {
                                                    try {
                                                        newReferences.add(new TagReference(newTagInstance, tr.getTarget().toString(), tr.getRange(), tr.getUserMarkupCollectionUuid()));
                                                    } catch (URISyntaxException e) {
                                                        // the single reference is skipped, but the failure goes to the log like all other errors here
                                                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error creating a reference for a merged intrinsic markup annotation: " + entryName, e);
                                                    }
                                                });
                                                intrinsicMarkupCollection.addTagReferences(newReferences);
                                            }
                                        } else {
                                            // unknown tag: move it into the target tagset
                                            tag.setTagsetDefinitionUuid(targetTagset.getUuid());
                                            project.getTagManager().addTagDefinition(targetTagset, tag);
                                        }
                                    }
                                });
                                project.importCollection(Collections.emptyList(), intrinsicMarkupCollection);
                            }
                            if (tempFile.exists()) {
                                tempFile.delete();
                            }
                        }
                    } catch (Exception e) {
                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }
                        Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Document will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
                    }
                });
            }
            entry = taIs.getNextTarEntry();
        }
    }
    return null;
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) AbstractSourceContentHandler(de.catma.document.source.contenthandler.AbstractSourceContentHandler) Arrays(java.util.Arrays) GZIPInputStream(java.util.zip.GZIPInputStream) URISyntaxException(java.net.URISyntaxException) UI(com.vaadin.ui.UI) XmlMarkupCollectionSerializationHandler(de.catma.serialization.intrinsic.xml.XmlMarkupCollectionSerializationHandler) TechInfoSet(de.catma.document.source.TechInfoSet) OldXMLContentHandler(de.catma.document.source.contenthandler.OldXMLContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Locale(java.util.Locale) TagsetImport(de.catma.ui.module.project.documentwizard.TagsetImport) Collection(java.util.Collection) IndexInfoSet(de.catma.document.source.IndexInfoSet) ProjectView(de.catma.ui.module.project.ProjectView) TagInstance(de.catma.tag.TagInstance) Logger(java.util.logging.Logger) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) TagReference(de.catma.document.annotation.TagReference) List(java.util.List) Type(com.vaadin.ui.Notification.Type) TagDefinition(de.catma.tag.TagDefinition) Optional(java.util.Optional) FileType(de.catma.document.source.FileType) Pattern(java.util.regex.Pattern) ContentInfoSet(de.catma.document.source.ContentInfoSet) PropertyDefinition(de.catma.tag.PropertyDefinition) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TagManager(de.catma.tag.TagManager) UploadFile(de.catma.ui.module.project.documentwizard.UploadFile) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) Multimap(com.google.common.collect.Multimap) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) LocaleUtils(org.apache.commons.lang3.LocaleUtils) Lists(com.google.common.collect.Lists) Charset(java.nio.charset.Charset) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) Notification(com.vaadin.ui.Notification) 
StandardContentHandler(de.catma.document.source.contenthandler.StandardContentHandler) TagsetDefinition(de.catma.tag.TagsetDefinition) Pair(de.catma.util.Pair) IDGenerator(de.catma.util.IDGenerator) TagLibrary(de.catma.tag.TagLibrary) ProgressListener(de.catma.backgroundservice.ProgressListener) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) Property(de.catma.tag.Property) Project(de.catma.project.Project) FileOutputStream(java.io.FileOutputStream) IOUtils(org.apache.commons.compress.utils.IOUtils) FileInputStream(java.io.FileInputStream) SourceDocument(de.catma.document.source.SourceDocument) AnnotationCollection(de.catma.document.annotation.AnnotationCollection) File(java.io.File) TagsetDefinitionImportStatus(de.catma.serialization.TagsetDefinitionImportStatus) TagsetImportState(de.catma.ui.module.project.documentwizard.TagsetImportState) Collections(java.util.Collections) Locale(java.util.Locale) SourceDocumentInfo(de.catma.document.source.SourceDocumentInfo) ArrayList(java.util.ArrayList) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) IndexInfoSet(de.catma.document.source.IndexInfoSet) List(java.util.List) ArrayList(java.util.ArrayList) TechInfoSet(de.catma.document.source.TechInfoSet) TagLibrary(de.catma.tag.TagLibrary) AnnotationCollection(de.catma.document.annotation.AnnotationCollection) SourceDocument(de.catma.document.source.SourceDocument) XML2ContentHandler(de.catma.document.source.contenthandler.XML2ContentHandler) FileInputStream(java.io.FileInputStream) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) TagsetDefinition(de.catma.tag.TagsetDefinition) OldXMLContentHandler(de.catma.document.source.contenthandler.OldXMLContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) FileOutputStream(java.io.FileOutputStream) TagReference(de.catma.document.annotation.TagReference) 
StandardContentHandler(de.catma.document.source.contenthandler.StandardContentHandler) UploadFile(de.catma.ui.module.project.documentwizard.UploadFile) File(java.io.File) IDGenerator(de.catma.util.IDGenerator) TagsetImport(de.catma.ui.module.project.documentwizard.TagsetImport) TagDefinition(de.catma.tag.TagDefinition) URISyntaxException(java.net.URISyntaxException) GZIPInputStream(java.util.zip.GZIPInputStream) ContentInfoSet(de.catma.document.source.ContentInfoSet) ProjectView(de.catma.ui.module.project.ProjectView) Property(de.catma.tag.Property) AbstractSourceContentHandler(de.catma.document.source.contenthandler.AbstractSourceContentHandler) ByteArrayOutputStream(java.io.ByteArrayOutputStream) PropertyDefinition(de.catma.tag.PropertyDefinition) URISyntaxException(java.net.URISyntaxException) TagManager(de.catma.tag.TagManager) XmlMarkupCollectionSerializationHandler(de.catma.serialization.intrinsic.xml.XmlMarkupCollectionSerializationHandler) TagInstance(de.catma.tag.TagInstance) TagsetDefinitionImportStatus(de.catma.serialization.TagsetDefinitionImportStatus)

Aggregations

ArrayListMultimap (com.google.common.collect.ArrayListMultimap)1 Lists (com.google.common.collect.Lists)1 Multimap (com.google.common.collect.Multimap)1 Notification (com.vaadin.ui.Notification)1 Type (com.vaadin.ui.Notification.Type)1 UI (com.vaadin.ui.UI)1 ProgressListener (de.catma.backgroundservice.ProgressListener)1 AnnotationCollection (de.catma.document.annotation.AnnotationCollection)1 TagReference (de.catma.document.annotation.TagReference)1 ContentInfoSet (de.catma.document.source.ContentInfoSet)1 FileType (de.catma.document.source.FileType)1 IndexInfoSet (de.catma.document.source.IndexInfoSet)1 SourceDocument (de.catma.document.source.SourceDocument)1 SourceDocumentInfo (de.catma.document.source.SourceDocumentInfo)1 TechInfoSet (de.catma.document.source.TechInfoSet)1 AbstractSourceContentHandler (de.catma.document.source.contenthandler.AbstractSourceContentHandler)1 OldXMLContentHandler (de.catma.document.source.contenthandler.OldXMLContentHandler)1 StandardContentHandler (de.catma.document.source.contenthandler.StandardContentHandler)1 XML2ContentHandler (de.catma.document.source.contenthandler.XML2ContentHandler)1 Project (de.catma.project.Project)1