Use of de.catma.document.source.contenthandler.XML2ContentHandler in the project catma by forTEXT.
Class ProjectView, method addUploadFile.
/**
 * Adds one uploaded file to the project as a source document.
 * <p>
 * The document content is loaded once to determine the file OS type and a CRC32 checksum,
 * both of which are stored in the document's tech info set before the document is inserted.
 * If the upload carries an intrinsic markup collection it is imported as well, and if a
 * collection name pattern is given an empty annotation collection is created for the document.
 * <p>
 * An {@link IOException} is logged and reported to the user via a notification instead of
 * being propagated.
 *
 * @param uploadFile the uploaded file to add
 * @param useApostropheAsSeparator passed on to {@code uploadFile.getIndexInfoSet(...)}
 * @param collectionNamePattern name pattern for the annotation collection to create, the
 *        placeholder <code>{{Title}}</code> is replaced by the title of the upload;
 *        {@code null} or empty means that no collection is created
 */
private void addUploadFile(UploadFile uploadFile, boolean useApostropheAsSeparator, String collectionNamePattern) {
    SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo(
            uploadFile.getIndexInfoSet(useApostropheAsSeparator),
            uploadFile.getContentInfoSet(),
            uploadFile.getTechInfoSet());

    // XML2 uploads get the dedicated XML handler, everything else is read through Tika
    SourceContentHandler contentHandler =
            sourceDocumentInfo.getTechInfoSet().getMimeType().equals(FileType.XML2.getMimeType())
                    ? new XML2ContentHandler()
                    : new TikaContentHandler();
    contentHandler.setSourceDocumentInfo(sourceDocumentInfo);

    SourceDocument document = new SourceDocument(uploadFile.getUuid(), contentHandler);

    try {
        String content = document.getContent();

        FileOSType fileOSType = FileOSType.getFileOSType(content);
        sourceDocumentInfo.getTechInfoSet().setFileOSType(fileOSType);

        // Encode with a fixed charset: the no-arg getBytes() uses the platform default,
        // which would make the checksum of the same document differ between hosts.
        CRC32 checksum = new CRC32();
        checksum.update(content.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        sourceDocumentInfo.getTechInfoSet().setChecksum(checksum.getValue());

        project.insert(document);

        AnnotationCollection intrinsicMarkupCollection = uploadFile.getIntrinsicMarkupCollection();
        if (intrinsicMarkupCollection != null) {
            project.importCollection(Collections.emptyList(), intrinsicMarkupCollection);
        }

        if (collectionNamePattern != null && !collectionNamePattern.isEmpty()) {
            String collectionName = collectionNamePattern.replace("{{Title}}", uploadFile.getTitle());
            project.createUserMarkupCollection(collectionName, document);
        }
    } catch (IOException e) {
        Logger.getLogger(ProjectView.class.getName()).log(
                Level.SEVERE,
                String.format("Error loading content of %1$s", uploadFile.getTempFilename().toString()),
                e);

        // never show "null" to the user
        String errorMsg = e.getMessage();
        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
            errorMsg = "";
        }

        Notification.show(
                "Error",
                String.format(
                        "Error loading content of %1$s! "
                                + "This document will be skipped!\n The underlying error message was:\n%2$s",
                        uploadFile.getTitle(),
                        errorMsg),
                Type.ERROR_MESSAGE);
    }
}
Use of de.catma.document.source.contenthandler.XML2ContentHandler in the project catma by forTEXT.
Class ImportIntrinsicMarkupStep, method enter.
/**
 * Called when the wizard enters this step. Unless the step is entered by going back, the
 * intrinsic markup of every uploaded XML2 file is deserialized by a background task and the
 * non-empty tagsets found in the process are offered for import.
 * <p>
 * While the task runs the content panel is disabled and an indeterminate progress bar is
 * shown; both are reset when the task finishes, successfully or not.
 *
 * @param back {@code true} if the step is entered by navigating backwards, in which case
 *        nothing is done
 */
@Override
public void enter(boolean back) {
    if (back) {
        return;
    }

    contentPanel.setEnabled(false);
    progressBar.setVisible(true);
    progressBar.setIndeterminate(true);

    // only XML2 uploads can carry intrinsic markup
    @SuppressWarnings("unchecked")
    final ArrayList<UploadFile> files = new ArrayList<UploadFile>(
            ((Collection<UploadFile>) wizardContext.get(DocumentWizard.WizardContextKey.UPLOAD_FILE_LIST))
                    .stream()
                    .filter(uploadFile -> uploadFile.getMimetype().equals(FileType.XML2.getMimeType()))
                    .collect(Collectors.toList()));

    // collects the tagsets of all inspected files
    final TagManager tagmanager = new TagManager(new TagLibrary());

    BackgroundServiceProvider backgroundServiceProvider = (BackgroundServiceProvider) UI.getCurrent();
    backgroundServiceProvider.submit("inspecting-intrinsic-markup", new DefaultProgressCallable<List<UploadFile>>() {
        @Override
        public List<UploadFile> call() throws Exception {
            IDGenerator idGenerator = new IDGenerator();

            for (UploadFile uploadFile : files) {
                XML2ContentHandler contentHandler = new XML2ContentHandler();
                SourceDocument doc = new SourceDocument(uploadFile.getUuid(), contentHandler);

                SourceDocumentInfo documentInfo = new SourceDocumentInfo();
                TechInfoSet techInfoSet = new TechInfoSet();
                techInfoSet.setURI(uploadFile.getTempFilename());
                documentInfo.setTechInfoSet(techInfoSet);
                contentHandler.setSourceDocumentInfo(documentInfo);

                XmlMarkupCollectionSerializationHandler handler = new XmlMarkupCollectionSerializationHandler(
                        tagmanager, contentHandler, project.getUser().getIdentifier());

                try (FileInputStream fis = new FileInputStream(new File(uploadFile.getTempFilename()))) {
                    AnnotationCollection collection =
                            handler.deserialize(doc, idGenerator.generateCollectionId(), fis);
                    uploadFile.setIntrinsicMarkupCollection(collection);
                }
            }

            return files;
        }
    }, new ExecutionListener<List<UploadFile>>() {
        @Override
        public void done(List<UploadFile> result) {
            contentPanel.setEnabled(true);
            progressBar.setVisible(false);
            progressBar.setIndeterminate(false);

            fileList.clear();
            fileList.addAll(result);
            fileDataProvider.refreshAll();

            tagsetImportList.clear();
            String defaultIntrinsicXMLElmentsName = "Default Intrinsic XML Elements";

            for (TagsetDefinition tagset : tagmanager.getTagLibrary()) {
                if (!tagset.isEmpty()) {
                    // a tagset with the same ID in the project is the merge target,
                    // otherwise the extracted tagset itself gets created
                    TagsetDefinition targetTagset =
                            project.getTagManager().getTagLibrary().getTagsetDefinition(tagset.getUuid());
                    boolean inProject = false;
                    if (targetTagset == null) {
                        targetTagset = tagset;
                    } else {
                        inProject = true;
                    }

                    // the namespace has to be read before the default name is applied
                    String namespace = tagset.getName() == null ? "none" : tagset.getName();
                    if (tagset.getName() == null) {
                        tagset.setName(defaultIntrinsicXMLElmentsName);
                    }

                    TagsetImport tagsetImport = new TagsetImport(
                            namespace,
                            tagset,
                            targetTagset,
                            inProject ? TagsetImportState.WILL_BE_MERGED : TagsetImportState.WILL_BE_CREATED);
                    tagsetImportList.add(tagsetImport);
                }
            }

            tagsetDataProvider.refreshAll();
            wizardContext.put(DocumentWizard.WizardContextKey.TAGSET_IMPORT_LIST, tagsetImportList);

            if (stepChangeListener != null) {
                stepChangeListener.stepChanged(ImportIntrinsicMarkupStep.this);
            }
        }

        @Override
        public void error(Throwable t) {
            // leave the busy state, otherwise the step stays disabled with a running
            // progress bar after a failed inspection
            contentPanel.setEnabled(true);
            progressBar.setVisible(false);
            progressBar.setIndeterminate(false);

            Logger.getLogger(ImportIntrinsicMarkupStep.class.getName()).log(
                    Level.SEVERE, "Error inspecting files", t);

            // never show "null" to the user
            String errorMsg = t.getMessage();
            if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                errorMsg = "";
            }

            Notification.show(
                    "Error",
                    String.format(
                            "Error inspecting the contents! "
                                    + "\n The underlying error message was:\n%1$s",
                            errorMsg),
                    Type.ERROR_MESSAGE);
        }
    });
}
Use of de.catma.document.source.contenthandler.XML2ContentHandler in the project catma by forTEXT.
Class CorpusImporter, method importCorpus.
/**
 * Imports a CATMA 5 corpus export, a gzipped tar archive, into the given project.
 * <p>
 * !BACKGROUND THREAD! No direct UI code here! Everything that touches the project or shows a
 * notification runs inside {@code ui.accessSynchronously(...)}.
 * <p>
 * Every archive entry is classified by its <code>/</code>-separated path: the document ID is
 * taken from the third path segment, and an entry whose fourth segment is
 * {@code annotationcollections} is loaded as an annotation collection of that document. Every
 * other entry is treated as the content of a source document: it is copied to a temporary
 * file, inserted into the project and, for entries ending in {@code xml2} or {@code xml}, its
 * intrinsic markup is extracted, its tagsets are created or merged and the resulting
 * collection is imported.
 * <p>
 * Errors while importing a single collection or document are logged and shown to the user,
 * the import then continues with the next entry.
 *
 * @param progressListener receives progress messages
 * @param corpusFile the gzipped tar archive to import
 * @param documentMetadataList metadata of the documents, matched to an archive entry by
 *        {@code "catma://" + documentId}
 * @param tempDir directory in which the document contents are stored temporarily
 * @param ui the UI used to run project and notification code
 * @param project the project to import into
 * @return always {@code null}
 * @throws Exception if the archive cannot be read or a document entry has no matching
 *         metadata
 */
public Void importCorpus(final ProgressListener progressListener, final File corpusFile, final List<CorpusImportDocumentMetadata> documentMetadataList, final String tempDir, final UI ui, final Project project) throws Exception {
    progressListener.setProgress("Importing Corpus");

    // all three streams are managed here, so the file handle is released even if
    // one of the wrapping constructors fails
    try (FileInputStream fileIs = new FileInputStream(corpusFile);
            GZIPInputStream gzipIs = new GZIPInputStream(fileIs);
            TarArchiveInputStream taIs = new TarArchiveInputStream(gzipIs)) {
        TarArchiveEntry entry = taIs.getNextTarEntry();

        while (entry != null) {
            final String entryName = entry.getName();
            final String[] pathParts = entry.getName().split(Pattern.quote("/"));
            final String documentIdPart = pathParts[2];
            // NOTE(review): the offset of 3 skips the "__" marker plus one more character - confirm against the export format
            final String documentId = documentIdPart.substring(documentIdPart.indexOf("__") + 3);
            final String idUri = "catma://" + documentId;

            if (pathParts[3].equals("annotationcollections")) {
                progressListener.setProgress("Importing Collection %1$s", pathParts[4]);

                ui.accessSynchronously(() -> {
                    try {
                        // buffer the entry, the tar stream must not be consumed beyond it
                        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                        IOUtils.copy(taIs, buffer);

                        SourceDocument document = project.getSourceDocument(documentId);
                        Pair<AnnotationCollection, List<TagsetDefinitionImportStatus>> loadResult =
                                project.loadAnnotationCollection(new ByteArrayInputStream(buffer.toByteArray()), document);

                        List<TagsetDefinitionImportStatus> tagsetDefinitionImportStatusList = loadResult.getSecond();
                        final AnnotationCollection annotationCollection = loadResult.getFirst();

                        // constant-first comparison: tagsets without a name must not break the lookup
                        Optional<TagsetDefinition> optIntrinsicTagset = annotationCollection.getTagLibrary().getTagsetDefinitions().stream()
                                .filter(tagsetDef -> "Intrinsic Markup".equals(tagsetDef.getName()))
                                .findFirst();

                        // the "Intrinsic Markup" tagset and its annotations are dropped from the collection
                        if (optIntrinsicTagset.isPresent()) {
                            TagsetDefinition intrinsicTagset = optIntrinsicTagset.get();

                            List<TagReference> intrinsicAnnotations = annotationCollection.getTagReferences(intrinsicTagset);
                            if (!intrinsicAnnotations.isEmpty()) {
                                annotationCollection.removeTagReferences(intrinsicAnnotations);
                            }
                            annotationCollection.getTagLibrary().remove(intrinsicTagset);

                            tagsetDefinitionImportStatusList.stream()
                                    .filter(status -> status.getTagset().equals(intrinsicTagset))
                                    .findFirst()
                                    .ifPresent(status -> status.setDoImport(false));
                        }

                        // empty tagsets are not imported
                        tagsetDefinitionImportStatusList.stream()
                                .filter(status -> status.getTagset().isEmpty())
                                .forEach(status -> status.setDoImport(false));

                        if (!annotationCollection.isEmpty()) {
                            project.importCollection(tagsetDefinitionImportStatusList, annotationCollection);
                        }
                    } catch (Exception e) {
                        // NOTE(review): the logger is keyed to ProjectView although this is CorpusImporter - presumably copied over, confirm
                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }
                        Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Collection will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
                    }
                });
            } else {
                // fail with a clear message instead of a NullPointerException on the next line
                final CorpusImportDocumentMetadata documentMetadata = documentMetadataList.stream()
                        .filter(metadata -> metadata.getSourceDocID().equals(idUri))
                        .findFirst()
                        .orElseThrow(() -> new IllegalStateException(
                                "No metadata found for document " + idUri + " (archive entry " + entryName + ")"));

                final Locale locale = LocaleUtils.toLocale(documentMetadata.getSourceDocLocale());
                final boolean useApostrophe = Arrays.asList(documentMetadata.getSourceDocSepChars()).contains(String.valueOf(UploadFile.APOSTROPHE));
                final String title = (documentMetadata.getSourceDocName() == null || documentMetadata.getSourceDocName().isEmpty())
                        ? documentId
                        : documentMetadata.getSourceDocName();

                progressListener.setProgress("Importing Document %1$s", title);

                // the content is copied to a temp file that the content handler reads via its URI
                final File tempFile = new File(new File(tempDir), documentId);
                if (tempFile.exists()) {
                    tempFile.delete();
                }
                try (FileOutputStream fos = new FileOutputStream(tempFile)) {
                    IOUtils.copy(taIs, fos);
                }

                ui.accessSynchronously(() -> {
                    IDGenerator idGenerator = new IDGenerator();

                    IndexInfoSet indexInfoSet = new IndexInfoSet(
                            Collections.emptyList(),
                            useApostrophe ? Lists.newArrayList(UploadFile.APOSTROPHE) : Collections.emptyList(),
                            locale);
                    TechInfoSet techInfoSet = new TechInfoSet(documentId, FileType.TEXT.getMimeType(), tempFile.toURI());
                    ContentInfoSet contentInfoSet = new ContentInfoSet(
                            documentMetadata.getSourceDocAuthor(),
                            documentMetadata.getSourceDocDescription(),
                            documentMetadata.getSourceDocPublisher(),
                            title);
                    techInfoSet.setCharset(Charset.forName("UTF-8"));

                    SourceDocumentInfo documentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);

                    // the content handler is chosen by the suffix of the archive entry name
                    AbstractSourceContentHandler handler = null;
                    boolean loadIntrinsicMarkup = false;
                    if (entryName.endsWith("xml2")) {
                        handler = new XML2ContentHandler();
                        loadIntrinsicMarkup = true;
                    } else if (entryName.endsWith("xml")) {
                        handler = new OldXMLContentHandler();
                        loadIntrinsicMarkup = true;
                    } else {
                        handler = new StandardContentHandler();
                    }
                    handler.setSourceDocumentInfo(documentInfo);

                    SourceDocument document = new SourceDocument(documentId, handler);

                    try {
                        project.insert(document, false);

                        if (loadIntrinsicMarkup) {
                            final TagManager tagmanager = new TagManager(new TagLibrary());
                            // NOTE(review): the cast assumes OldXMLContentHandler is an XML2ContentHandler - confirm
                            XmlMarkupCollectionSerializationHandler markupHandler = new XmlMarkupCollectionSerializationHandler(
                                    tagmanager, (XML2ContentHandler) handler, project.getUser().getIdentifier());

                            try (FileInputStream fis = new FileInputStream(tempFile)) {
                                AnnotationCollection intrinsicMarkupCollection =
                                        markupHandler.deserialize(document, idGenerator.generateCollectionId(), fis);

                                Collection<TagsetImport> tagsetImports = new ArrayList<TagsetImport>();
                                String defaultIntrinsicXMLElmentsName = "Default Intrinsic XML Elements";

                                for (TagsetDefinition tagset : tagmanager.getTagLibrary()) {
                                    if (!tagset.isEmpty()) {
                                        // a tagset with the same ID in the project is the merge target,
                                        // otherwise the extracted tagset itself gets created
                                        TagsetDefinition targetTagset =
                                                project.getTagManager().getTagLibrary().getTagsetDefinition(tagset.getUuid());
                                        boolean inProject = false;
                                        if (targetTagset == null) {
                                            targetTagset = tagset;
                                        } else {
                                            inProject = true;
                                        }

                                        // the namespace has to be read before the default name is applied
                                        String namespace = tagset.getName() == null ? "none" : tagset.getName();
                                        if (tagset.getName() == null) {
                                            tagset.setName(defaultIntrinsicXMLElmentsName);
                                        }

                                        TagsetImport tagsetImport = new TagsetImport(
                                                namespace,
                                                tagset,
                                                targetTagset,
                                                inProject ? TagsetImportState.WILL_BE_MERGED : TagsetImportState.WILL_BE_CREATED);
                                        tagsetImports.add(tagsetImport);
                                    }
                                }

                                // Creating Tagsets
                                tagsetImports.stream()
                                        .filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_CREATED))
                                        .forEach(tagsetImport -> {
                                            if (project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid()) != null) {
                                                // already imported, so it will be a merge
                                                tagsetImport.setImportState(TagsetImportState.WILL_BE_MERGED);
                                            } else {
                                                TagsetDefinition extractedTagset = tagsetImport.getExtractedTagset();
                                                try {
                                                    project.importTagsets(Collections.singletonList(new TagsetDefinitionImportStatus(
                                                            extractedTagset,
                                                            project.inProjectHistory(extractedTagset.getUuid()),
                                                            project.getTagManager().getTagLibrary().getTagsetDefinition(extractedTagset.getUuid()) != null)));
                                                } catch (Exception e) {
                                                    Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, String.format("Error importing tagset %1$s with ID %2$s", extractedTagset.getName(), extractedTagset.getUuid()), e);
                                                    String errorMsg = e.getMessage();
                                                    if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                                                        errorMsg = "";
                                                    }
                                                    Notification.show("Error", String.format("Error importing tagset %1$s! " + "This tagset will be skipped!\n The underlying error message was:\n%2$s", extractedTagset.getName(), errorMsg), Type.ERROR_MESSAGE);
                                                }
                                            }
                                        });

                                // Merging Tagsets
                                tagsetImports.stream()
                                        .filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_MERGED))
                                        .forEach(tagsetImport -> {
                                            TagsetDefinition targetTagset =
                                                    project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid());

                                            for (TagDefinition tag : tagsetImport.getExtractedTagset()) {
                                                Optional<TagDefinition> optionalTag = targetTagset.getTagDefinitionsByName(tag.getName()).findFirst();

                                                if (optionalTag.isPresent()) {
                                                    TagDefinition existingTag = optionalTag.get();

                                                    // add the property definitions the existing tag does not have yet
                                                    tag.getUserDefinedPropertyDefinitions().forEach(pd -> {
                                                        if (existingTag.getPropertyDefinition(pd.getName()) == null) {
                                                            project.getTagManager().addUserDefinedPropertyDefinition(existingTag, new PropertyDefinition(pd));
                                                        }
                                                    });

                                                    // re-create the annotations of the incoming tag as annotations of the existing tag
                                                    List<TagReference> tagReferences = intrinsicMarkupCollection.getTagReferences(tag);
                                                    intrinsicMarkupCollection.removeTagReferences(tagReferences);

                                                    Multimap<TagInstance, TagReference> referencesByInstance = ArrayListMultimap.create();
                                                    tagReferences.forEach(tr -> referencesByInstance.put(tr.getTagInstance(), tr));

                                                    for (TagInstance incomingTagInstance : referencesByInstance.keySet()) {
                                                        TagInstance newTagInstance = new TagInstance(
                                                                idGenerator.generate(),
                                                                existingTag.getUuid(),
                                                                incomingTagInstance.getAuthor(),
                                                                incomingTagInstance.getTimestamp(),
                                                                existingTag.getUserDefinedPropertyDefinitions(),
                                                                targetTagset.getUuid());

                                                        // properties are matched by the name of their definition
                                                        for (Property oldProp : incomingTagInstance.getUserDefinedProperties()) {
                                                            String oldPropDefId = oldProp.getPropertyDefinitionId();
                                                            PropertyDefinition oldPropDef = tag.getPropertyDefinitionByUuid(oldPropDefId);
                                                            PropertyDefinition existingPropDef = existingTag.getPropertyDefinition(oldPropDef.getName());
                                                            newTagInstance.addUserDefinedProperty(new Property(existingPropDef.getUuid(), oldProp.getPropertyValueList()));
                                                        }

                                                        ArrayList<TagReference> newReferences = new ArrayList<>();
                                                        referencesByInstance.get(incomingTagInstance).forEach(tr -> {
                                                            try {
                                                                newReferences.add(new TagReference(newTagInstance, tr.getTarget().toString(), tr.getRange(), tr.getUserMarkupCollectionUuid()));
                                                            } catch (URISyntaxException e) {
                                                                // the reference is skipped, but the failure must show up in the log
                                                                Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error creating a tag reference while importing the CATMA 5 Corpus: " + entryName, e);
                                                            }
                                                        });
                                                        intrinsicMarkupCollection.addTagReferences(newReferences);
                                                    }
                                                } else {
                                                    // unknown tag: move it into the target tagset
                                                    tag.setTagsetDefinitionUuid(targetTagset.getUuid());
                                                    project.getTagManager().addTagDefinition(targetTagset, tag);
                                                }
                                            }
                                        });

                                project.importCollection(Collections.emptyList(), intrinsicMarkupCollection);
                            }

                            if (tempFile.exists()) {
                                tempFile.delete();
                            }
                        }
                    } catch (Exception e) {
                        Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }
                        Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Document will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
                    }
                });
            }

            entry = taIs.getNextTarEntry();
        }
    }

    return null;
}
Use of de.catma.document.source.contenthandler.XML2ContentHandler in the project catma by forTEXT.
Class InspectContentStep, method enter.
/**
 * Called when the wizard enters this step. Unless the step is entered by going back, all
 * uploaded files are inspected by a background task that tries to detect the language of
 * each file and, for files that are not XML2, the charset as well.
 * <p>
 * While the task runs the content panel is disabled and an indeterminate progress bar is
 * shown; both are reset when the task finishes, successfully or not. On success the first
 * file gets selected and previewed.
 *
 * @param back {@code true} if the step is entered by navigating backwards, in which case
 *        nothing is done
 */
@Override
public void enter(boolean back) {
    if (back) {
        return;
    }

    @SuppressWarnings("unchecked")
    Collection<UploadFile> fileList =
            (Collection<UploadFile>) wizardContext.get(DocumentWizard.WizardContextKey.UPLOAD_FILE_LIST);

    contentPanel.setEnabled(false);
    progressBar.setVisible(true);
    progressBar.setIndeterminate(true);

    final ArrayList<UploadFile> files = new ArrayList<UploadFile>(fileList);

    // captured here because UI.getCurrent() is not automatically defined inside the background task
    final UI ui = UI.getCurrent();
    BackgroundServiceProvider backgroundServiceProvider = (BackgroundServiceProvider) ui;

    backgroundServiceProvider.submit("inspecting-files", new DefaultProgressCallable<List<UploadFile>>() {
        @Override
        public List<UploadFile> call() throws Exception {
            Tika tika = new Tika();
            LanguageDetector languageDetector = LanguageDetector.getDefaultLanguageDetector();
            try {
                languageDetector.loadModels();
            } catch (IOException e) {
                // UI code has to be handed over to the UI, see above
                ui.access(() -> ((ErrorHandler) ui).showAndLogError("Error loading language detection models!", e));
            }

            for (UploadFile uploadFile : files) {
                if (uploadFile.getMimetype().equals(FileType.XML2.getMimeType())) {
                    // XML2 content is extracted by the dedicated handler, errors abort the whole task
                    XML2ContentHandler contentHandler = new XML2ContentHandler();
                    SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo();
                    TechInfoSet techInfoSet = new TechInfoSet(
                            uploadFile.getOriginalFilename(), uploadFile.getMimetype(), uploadFile.getTempFilename());
                    sourceDocumentInfo.setTechInfoSet(techInfoSet);
                    contentHandler.setSourceDocumentInfo(sourceDocumentInfo);
                    contentHandler.load();

                    String content = contentHandler.getContent();
                    LanguageResult languageResult = languageDetector.detect(content);
                    if (languageResult.isReasonablyCertain() && languageResult.getLanguage() != null) {
                        uploadFile.setLanguage(new LanguageItem(new Locale(languageResult.getLanguage())));
                    }
                } else {
                    Metadata metadata = new Metadata();
                    try {
                        try (FileInputStream fis = new FileInputStream(new File(uploadFile.getTempFilename()))) {
                            String content = tika.parseToString(fis, metadata);

                            // the charset, if any, is a parameter of the detected content type
                            String contentType = metadata.get(Metadata.CONTENT_TYPE);
                            MediaType mediaType = MediaType.parse(contentType);
                            String charset = mediaType.getParameters().get("charset");
                            if (charset != null) {
                                uploadFile.setCharset(Charset.forName(charset));
                            }

                            LanguageResult languageResult = languageDetector.detect(content);
                            if (languageResult.isReasonablyCertain() && languageResult.getLanguage() != null) {
                                uploadFile.setLanguage(new LanguageItem(new Locale(languageResult.getLanguage())));
                            }
                        }
                    } catch (Exception e) {
                        // a failing file is reported, the inspection continues with the next one
                        Logger.getLogger(InspectContentStep.class.getName()).log(
                                Level.SEVERE,
                                String.format("Error inspecting %1$s", uploadFile.getOriginalFilename()),
                                e);

                        // never show "null" to the user
                        String errorMsg = e.getMessage();
                        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                            errorMsg = "";
                        }

                        final String description = String.format(
                                "Error inspecting content of %1$s! "
                                        + "Adding this file to your Project might fail!\n The underlying error message was:\n%2$s",
                                uploadFile.getOriginalFilename(),
                                errorMsg);
                        ui.access(() -> Notification.show("Error", description, Type.ERROR_MESSAGE));
                    }
                }
            }

            return files;
        }
    }, new ExecutionListener<List<UploadFile>>() {
        @Override
        public void done(List<UploadFile> result) {
            contentPanel.setEnabled(true);
            progressBar.setVisible(false);
            progressBar.setIndeterminate(false);

            fileList.clear();
            fileList.addAll(result);
            fileDataProvider.refreshAll();

            if (!fileList.isEmpty()) {
                fileList.stream().findFirst().ifPresent(uploadFile -> {
                    fileGrid.select(uploadFile);
                    updatePreview(uploadFile);
                });
            }

            if (stepChangeListener != null) {
                stepChangeListener.stepChanged(InspectContentStep.this);
            }
        }

        @Override
        public void error(Throwable t) {
            // leave the busy state, otherwise the step stays disabled with a running
            // progress bar after a failed inspection
            contentPanel.setEnabled(true);
            progressBar.setVisible(false);
            progressBar.setIndeterminate(false);

            Logger.getLogger(InspectContentStep.class.getName()).log(
                    Level.SEVERE, "Error inspecting files", t);

            // never show "null" to the user
            String errorMsg = t.getMessage();
            if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                errorMsg = "";
            }

            Notification.show(
                    "Error",
                    String.format(
                            "Error inspecting the contents! "
                                    + "\n The underlying error message was:\n%1$s",
                            errorMsg),
                    Type.ERROR_MESSAGE);
        }
    });
}
Use of de.catma.document.source.contenthandler.XML2ContentHandler in the project catma by forTEXT.
Class InspectContentStep, method updatePreview.
/**
 * Shows a preview of the content of the given upload in the preview text area.
 * <p>
 * XML2 files are loaded through an {@code XML2ContentHandler}; all other files are parsed by
 * Tika, limited to 3000 characters. A non-empty preview gets the suffix {@code " [...] "}.
 * The right-to-left style of the preview is switched according to the locale of the upload.
 * <p>
 * Errors are logged and reported to the user via a notification.
 *
 * @param uploadFile the upload to preview
 */
private void updatePreview(UploadFile uploadFile) {
    try {
        String content = "";
        IndexInfoSet indexInfoSet =
                new IndexInfoSet(Collections.emptyList(), Collections.emptyList(), uploadFile.getLocale());

        if (uploadFile.getMimetype().equals(FileType.XML2.getMimeType())) {
            XML2ContentHandler contentHandler = new XML2ContentHandler();
            SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo();
            TechInfoSet techInfoSet = new TechInfoSet(
                    uploadFile.getOriginalFilename(), uploadFile.getMimetype(), uploadFile.getTempFilename());
            sourceDocumentInfo.setTechInfoSet(techInfoSet);
            contentHandler.setSourceDocumentInfo(sourceDocumentInfo);
            contentHandler.load();
            content = contentHandler.getContent();
        } else {
            // Tika and its metadata are only needed for non-XML2 files
            Tika tika = new Tika();
            Metadata metadata = new Metadata();

            // For plain text the charset of the upload is handed to Tika as a hint.
            // MediaType.parse may return null and the charset may not have been detected,
            // in both cases Tika is left to its own detection.
            MediaType type = MediaType.parse(uploadFile.getMimetype());
            if (type != null
                    && uploadFile.getCharset() != null
                    && type.getBaseType().toString().equals(FileType.TEXT.getMimeType())) {
                metadata.set(Metadata.CONTENT_TYPE, new MediaType(type, uploadFile.getCharset()).toString());
            }

            try (FileInputStream fis = new FileInputStream(new File(uploadFile.getTempFilename()))) {
                content = tika.parseToString(fis, metadata, 3000);
            }
        }

        if (!content.isEmpty()) {
            content += " [...] ";
        }
        taPreview.setValue(content);

        if (indexInfoSet.isRightToLeftWriting()) {
            taPreview.addStyleName("document-wizard-rtl-preview");
        } else {
            taPreview.removeStyleName("document-wizard-rtl-preview");
        }
    } catch (Exception e) {
        Logger.getLogger(InspectContentStep.class.getName()).log(
                Level.SEVERE,
                String.format("Error loading preview of %1$s", uploadFile.getOriginalFilename()),
                e);

        // never show "null" to the user
        String errorMsg = e.getMessage();
        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
            errorMsg = "";
        }

        Notification.show(
                "Error",
                String.format(
                        "Error loading content of %1$s! "
                                + "Adding this file to your Project might fail!\n The underlying error message was:\n%2$s",
                        uploadFile.getOriginalFilename(),
                        errorMsg),
                Type.ERROR_MESSAGE);
    }
}
Aggregations