Use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.
The class GitSourceDocumentHandlerTest, method create.
/**
 * Verifies that GitSourceDocumentHandler.create writes the original and converted source
 * document files plus the expected header.json into the per-document repository, and that
 * the repo manager is left detached after each handler call.
 *
 * @throws Exception if any file or git operation fails
 */
@Test
public void create() throws Exception {
	File originalSourceDocument = new File("testdocs/rose_for_emily.pdf");
	File convertedSourceDocument = new File("testdocs/rose_for_emily.txt");

	IndexInfoSet indexInfoSet = new IndexInfoSet();
	indexInfoSet.setLocale(Locale.ENGLISH);
	ContentInfoSet contentInfoSet = new ContentInfoSet("William Faulkner", "", "", "A Rose for Emily");
	TechInfoSet techInfoSet = new TechInfoSet(FileType.TEXT, StandardCharsets.UTF_8, FileOSType.DOS, 705211438L);
	SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);

	// extract terms from a dedicated, properly closed stream; a second stream is opened
	// further down for writing the document content (FileInputStream does not support `reset`)
	Map<String, List<TermInfo>> terms;
	try (FileInputStream termExtractionStream = new FileInputStream(convertedSourceDocument)) {
		terms = new TermExtractor(
				IOUtils.toString(termExtractionStream, techInfoSet.getCharset()),
				new ArrayList<>(),
				new ArrayList<>(),
				indexInfoSet.getLocale()
		).getTerms();
	}

	String sourceDocumentUuid = new IDGenerator().generateDocumentId();
	// GraphWorktreeProject.TOKENIZED_FILE_EXTENSION
	String tokenizedSourceDocumentFileName = sourceDocumentUuid + "." + "json";

	// all streams participate in try-with-resources so they are closed even when an
	// assertion fails (the previous version leaked both FileInputStreams)
	try (
			FileInputStream originalSourceDocumentStream = new FileInputStream(originalSourceDocument);
			FileInputStream convertedSourceDocumentStream = new FileInputStream(convertedSourceDocument);
			ILocalGitRepositoryManager jGitRepoManager = new JGitRepoManager(CATMAPropertyKey.GitBasedRepositoryBasePath.getValue(), gitlabManagerRestricted.getUser())
	) {
		directoriesToDeleteOnTearDown.add(jGitRepoManager.getRepositoryBasePath());

		BackgroundService mockBackgroundService = mock(BackgroundService.class);
		EventBus mockEventBus = mock(EventBus.class);

		GitProjectManager gitProjectManager = new GitProjectManager(
				CATMAPropertyKey.GitBasedRepositoryBasePath.getValue(),
				gitlabManagerRestricted,
				(projectId) -> {}, // noop deletion handler
				mockBackgroundService,
				mockEventBus
		);

		String projectId = gitProjectManager.create("Test CATMA Project", "This is a test CATMA project");
		// we don't add the projectId to projectsToDeleteOnTearDown as deletion of the user will take care of that for us

		// the JGitRepoManager instance should always be in a detached state after GitProjectManager calls return
		assertFalse(jGitRepoManager.isAttached());

		GitSourceDocumentHandler gitSourceDocumentHandler = new GitSourceDocumentHandler(jGitRepoManager, gitlabManagerRestricted, new UsernamePasswordCredentialsProvider("oauth2", gitlabManagerRestricted.getPassword()));

		String revisionHash = gitSourceDocumentHandler.create(projectId, sourceDocumentUuid, originalSourceDocumentStream, originalSourceDocument.getName(), convertedSourceDocumentStream, convertedSourceDocument.getName(), terms, tokenizedSourceDocumentFileName, sourceDocumentInfo);
		assertNotNull(revisionHash);

		// the JGitRepoManager instance should always be in a detached state after GitSourceDocumentHandler calls return
		assertFalse(jGitRepoManager.isAttached());

		File expectedRepoPath = Paths.get(jGitRepoManager.getRepositoryBasePath().getPath(), projectId, sourceDocumentUuid).toFile();
		assert expectedRepoPath.exists();
		assert expectedRepoPath.isDirectory();

		// read the directory listing once instead of once per assertion
		List<String> repoPathEntries = Arrays.asList(expectedRepoPath.list());
		assert repoPathEntries.contains("rose_for_emily.pdf");
		assert repoPathEntries.contains("rose_for_emily.txt");
		assert FileUtils.contentEquals(originalSourceDocument, new File(expectedRepoPath, "rose_for_emily.pdf"));
		assert FileUtils.contentEquals(convertedSourceDocument, new File(expectedRepoPath, "rose_for_emily.txt"));
		assert repoPathEntries.contains("header.json");

		String expectedSerializedSourceDocumentInfo = "" + "{\n" + " \"gitContentInfoSet\": {\n" + " \"author\": \"William Faulkner\",\n" + " \"description\": \"\",\n" + " \"publisher\": \"\",\n" + " \"title\": \"A Rose for Emily\"\n" + " },\n" + " \"gitIndexInfoSet\": {\n" + " \"locale\": \"en\",\n" + " \"unseparableCharacterSequences\": [],\n" + " \"userDefinedSeparatingCharacters\": []\n" + " },\n" + " \"gitTechInfoSet\": {\n" + " \"charset\": \"UTF-8\",\n" + " \"checksum\": 705211438,\n" + " \"fileName\": null,\n" + " \"fileOSType\": \"DOS\",\n" + " \"fileType\": \"TEXT\",\n" + " \"mimeType\": \"text/plain\",\n" + " \"uri\": null\n" + " }\n" + "}";
		assertEquals(expectedSerializedSourceDocumentInfo, FileUtils.readFileToString(new File(expectedRepoPath, "header.json"), StandardCharsets.UTF_8));
	}
}
Use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.
The class JsonLdWebAnnotationTest, method getJsonLdWebAnnotation.
/**
 * Builds a fully populated test project (tagset, source document, markup collection,
 * tag definition and tag instance) and constructs a {@link JsonLdWebAnnotation} that
 * references two ranges within the source document.
 *
 * @param jGitRepoManager the repo manager to operate on; it is closed when this method returns
 *                        (it is the resource of the try-with-resources block below)
 * @param gitLabServerManager remote git server access, used for push operations
 * @param catmaUser the user under whose identity the test project is created
 * @return a HashMap<String, Object> with these keys:
 *         'jsonLdWebAnnotation' - for the JsonLdWebAnnotation object
 *         'projectUuid'
 *         --- following additional keys which are to be used when formatting EXPECTED_SERIALIZED_ANNOTATION ---:
 *         projectRootRepositoryName, tagsetDefinitionUuid, tagDefinitionUuid, userPropertyDefinitionUuid,
 *         systemPropertyDefinitionUuid, userMarkupCollectionUuid, tagInstanceUuid, sourceDocumentUuid
 * @throws Exception if any git or project operation fails
 */
public static HashMap<String, Object> getJsonLdWebAnnotation(JGitRepoManager jGitRepoManager, IRemoteGitServerManager gitLabServerManager, de.catma.user.User catmaUser) throws Exception {
// the caller's repo manager is adopted as the try-with-resources resource and closed on exit
try (JGitRepoManager localJGitRepoManager = jGitRepoManager) {
// caller should do the following:
// this.directoriesToDeleteOnTearDown.add(localJGitRepoManager.getRepositoryBasePath());
// create project
GitProjectManager gitProjectManager = new GitProjectManager(RepositoryPropertyKey.GitBasedRepositoryBasePath.getValue(), UserIdentification.userToMap(catmaUser.getIdentifier()));
String projectId = gitProjectManager.create("Test CATMA Project", "This is a test CATMA project");
// caller should do the following:
// this.projectsToDeleteOnTearDown.add(projectId);
GitProjectHandler gitProjectHandler = new GitProjectHandler(null, projectId, jGitRepoManager, gitLabServerManager);
// add new tagset to project
String tagsetId = gitProjectHandler.createTagset(null, "Test Tagset", null);
// add new source document to project
File originalSourceDocument = new File("testdocs/rose_for_emily.pdf");
File convertedSourceDocument = new File("testdocs/rose_for_emily.txt");
// NOTE(review): these two streams are handed to createSourceDocument and never closed here —
// confirm that createSourceDocument closes them
FileInputStream originalSourceDocumentStream = new FileInputStream(originalSourceDocument);
FileInputStream convertedSourceDocumentStream = new FileInputStream(convertedSourceDocument);
IndexInfoSet indexInfoSet = new IndexInfoSet();
indexInfoSet.setLocale(Locale.ENGLISH);
ContentInfoSet contentInfoSet = new ContentInfoSet("William Faulkner", "", "", "A Rose for Emily");
TechInfoSet techInfoSet = new TechInfoSet(FileType.TEXT, StandardCharsets.UTF_8, FileOSType.DOS, 705211438L);
SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);
String sourceDocumentId = gitProjectHandler.createSourceDocument(null, originalSourceDocumentStream, originalSourceDocument.getName(), convertedSourceDocumentStream, convertedSourceDocument.getName(), null, null, sourceDocumentInfo);
// add new markup collection to project
String markupCollectionId = gitProjectHandler.createMarkupCollection(null, "Test Markup Collection", null, sourceDocumentId, "fakeSourceDocumentVersion");
// commit the changes to the project root repo (addition of tagset, source document and markup collection
// submodules)
String projectRootRepositoryName = GitProjectManager.getProjectRootRepositoryName(projectId);
localJGitRepoManager.open(projectId, projectRootRepositoryName);
localJGitRepoManager.commit(String.format("Adding new tagset %s, source document %s and markup collection %s", tagsetId, sourceDocumentId, markupCollectionId), "Test Committer", "testcommitter@catma.de");
// can't call open on an attached instance
localJGitRepoManager.detach();
// construct TagDefinition object
IDGenerator idGenerator = new IDGenerator();
List<String> systemPropertyPossibleValues = Arrays.asList("SYSPROP_VAL_1", "SYSPROP_VAL_2");
PropertyDefinition systemPropertyDefinition = new PropertyDefinition(PropertyDefinition.SystemPropertyName.catma_displaycolor.toString(), systemPropertyPossibleValues);
List<String> userPropertyPossibleValues = Arrays.asList("UPROP_VAL_1", "UPROP_VAL_2");
PropertyDefinition userPropertyDefinition = new PropertyDefinition("UPROP_DEF", userPropertyPossibleValues);
String tagDefinitionUuid = idGenerator.generate();
TagDefinition tagDefinition = new TagDefinition(null, tagDefinitionUuid, "TAG_DEF", new Version(), null, null, tagsetId);
tagDefinition.addSystemPropertyDefinition(systemPropertyDefinition);
tagDefinition.addUserDefinedPropertyDefinition(userPropertyDefinition);
// call createTagDefinition
// NB: in this case we know that the tagset submodule is on the master branch tip, ie: not in a detached
// head state, so it's safe to make changes to the submodule and commit them
// TODO: createTagDefinition should probably do some validation and fail fast if the tagset submodule is in
// a detached head state - in that case the submodule would need to be updated first
// see the "Updating a submodule in-place in the container" scenario at
// https://medium.com/@porteneuve/mastering-git-submodules-34c65e940407
GitTagsetHandler gitTagsetHandler = new GitTagsetHandler(localJGitRepoManager, gitLabServerManager);
String returnedTagDefinitionId = gitTagsetHandler.createOrUpdateTagDefinition(projectId, tagsetId, tagDefinition);
assertNotNull(returnedTagDefinitionId);
assert returnedTagDefinitionId.startsWith("CATMA_");
// the JGitRepoManager instance should always be in a detached state after GitTagsetHandler calls return
assertFalse(localJGitRepoManager.isAttached());
assertEquals(tagDefinitionUuid, returnedTagDefinitionId);
// commit and push submodule changes (creation of tag definition)
// TODO: add methods to JGitRepoManager to do this
localJGitRepoManager.open(projectId, projectRootRepositoryName);
Repository projectRootRepository = localJGitRepoManager.getGitApi().getRepository();
String tagsetSubmodulePath = String.format("%s/%s", GitProjectHandler.TAGSET_SUBMODULES_DIRECTORY_NAME, tagsetId);
Repository tagsetSubmoduleRepository = SubmoduleWalk.getSubmoduleRepository(projectRootRepository, tagsetSubmodulePath);
Git submoduleGit = new Git(tagsetSubmoduleRepository);
submoduleGit.add().addFilepattern(tagDefinitionUuid).call();
submoduleGit.commit().setMessage(String.format("Adding tag definition %s", tagDefinitionUuid)).setCommitter("Test Committer", "testcommitter@catma.de").call();
submoduleGit.push().setCredentialsProvider(new UsernamePasswordCredentialsProvider(gitLabServerManager.getUsername(), gitLabServerManager.getPassword())).call();
tagsetSubmoduleRepository.close();
submoduleGit.close();
// commit and push project root repo changes (update of tagset submodule)
localJGitRepoManager.getGitApi().add().addFilepattern(tagsetSubmodulePath).call();
localJGitRepoManager.commit(String.format("Updating tagset %s", tagsetId), "Test Committer", "testcommitter@catma.de");
// construct TagInstance object
Property systemProperty = new Property(systemPropertyDefinition, Collections.singleton("SYSPROP_VAL_1"));
Property userProperty = new Property(userPropertyDefinition, Collections.singleton("UPROP_VAL_2"));
String tagInstanceUuid = idGenerator.generate();
TagInstance tagInstance = new TagInstance(tagInstanceUuid, tagDefinition);
tagInstance.addSystemProperty(systemProperty);
tagInstance.addUserDefinedProperty(userProperty);
// construct JsonLdWebAnnotation object
String sourceDocumentUri = String.format("http://catma.de/gitlab/%s/%s/%s", projectRootRepositoryName, GitProjectHandler.SOURCE_DOCUMENT_SUBMODULES_DIRECTORY_NAME, sourceDocumentId);
// two annotated ranges within the source document, both pointing at the same tag instance
Range range1 = new Range(12, 18);
Range range2 = new Range(41, 47);
List<TagReference> tagReferences = new ArrayList<>(Arrays.asList(new TagReference(tagInstance, sourceDocumentUri, range1, markupCollectionId), new TagReference(tagInstance, sourceDocumentUri, range2, markupCollectionId)));
JsonLdWebAnnotation jsonLdWebAnnotation = new JsonLdWebAnnotation("http://catma.de/gitlab", projectId, tagReferences);
// NOTE(review): the javadoc lists userPropertyDefinitionUuid and systemPropertyDefinitionUuid
// as returned keys, but they are never put into returnValue below — confirm whether callers expect them
HashMap<String, Object> returnValue = new HashMap<>();
returnValue.put("jsonLdWebAnnotation", jsonLdWebAnnotation);
returnValue.put("projectRootRepositoryName", projectRootRepositoryName);
returnValue.put("projectUuid", projectId);
returnValue.put("tagsetDefinitionUuid", tagsetId);
returnValue.put("tagDefinitionUuid", tagDefinitionUuid);
returnValue.put("userMarkupCollectionUuid", markupCollectionId);
returnValue.put("tagInstanceUuid", tagInstanceUuid);
returnValue.put("sourceDocumentUuid", sourceDocumentId);
return returnValue;
}
}
Use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.
The class CorpusImporter, method importCorpus.
/**
 * !BACKGROUND THREAD! No direct UI code here!
 * <p>
 * Imports a CATMA 5 corpus from a gzipped tar archive. Entries under an
 * "annotationcollections" path segment are loaded as annotation collections and imported
 * into the project; all other entries are treated as source documents, written to a temp
 * file, inserted into the project and — for XML documents — scanned for intrinsic markup.
 * Project/UI-bound work is funneled through {@code ui.accessSynchronously}.
 *
 * @param progressListener receives human-readable progress messages
 * @param corpusFile the gzipped tar archive (CATMA 5 corpus export)
 * @param documentMetadataList metadata records, matched to documents by source doc ID URI
 * @param tempDir directory used for temporary per-document files
 * @param ui the UI used to synchronize project mutations onto the UI thread
 * @param project the target project receiving documents, tagsets and collections
 * @return always null (Void)
 * @throws Exception if opening or reading the archive fails
 */
public Void importCorpus(final ProgressListener progressListener, final File corpusFile, final List<CorpusImportDocumentMetadata> documentMetadataList, final String tempDir, final UI ui, final Project project) throws Exception {
progressListener.setProgress("Importing Corpus");
GZIPInputStream gzipIs = new GZIPInputStream(new FileInputStream(corpusFile));
// closing taIs also closes the wrapped gzip/file streams
try (TarArchiveInputStream taIs = new TarArchiveInputStream(gzipIs)) {
TarArchiveEntry entry = taIs.getNextTarEntry();
while (entry != null) {
final String entryName = entry.getName();
// assumes entry paths have at least 4 segments, with the document id in segment 2
// and the entry type in segment 3 — TODO confirm against the CATMA 5 export layout
final String[] pathParts = entry.getName().split(Pattern.quote("/"));
final String documentIdPart = pathParts[2];
// NOTE(review): skips "__" (2 chars) plus one extra character — presumably the id part
// looks like "<prefix>__Xdocid"; confirm the +3 offset is intentional
final String documentId = documentIdPart.substring(documentIdPart.indexOf("__") + 3);
final String idUri = "catma://" + documentId;
if (pathParts[3].equals("annotationcollections")) {
progressListener.setProgress("Importing Collection %1$s", pathParts[4]);
// the current tar entry is consumed inside this synchronized block, so the entry
// must not be advanced until the block has run
ui.accessSynchronously(() -> {
try {
final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
IOUtils.copy(taIs, buffer);
SourceDocument document = project.getSourceDocument(documentId);
Pair<AnnotationCollection, List<TagsetDefinitionImportStatus>> loadResult = project.loadAnnotationCollection(new ByteArrayInputStream(buffer.toByteArray()), document);
List<TagsetDefinitionImportStatus> tagsetDefinitionImportStatusList = loadResult.getSecond();
final AnnotationCollection annotationCollection = loadResult.getFirst();
// intrinsic markup is stripped here; it is imported separately via the document branch below
Optional<TagsetDefinition> optIntrinsicTagset = annotationCollection.getTagLibrary().getTagsetDefinitions().stream().filter(tagsetDef -> tagsetDef.getName().equals("Intrinsic Markup")).findFirst();
if (optIntrinsicTagset.isPresent()) {
TagsetDefinition intrinsicTagset = optIntrinsicTagset.get();
List<TagReference> intrinsicAnnotations = annotationCollection.getTagReferences(intrinsicTagset);
if (!intrinsicAnnotations.isEmpty()) {
annotationCollection.removeTagReferences(intrinsicAnnotations);
}
annotationCollection.getTagLibrary().remove(intrinsicTagset);
tagsetDefinitionImportStatusList.stream().filter(status -> status.getTagset().equals(intrinsicTagset)).findFirst().ifPresent(status -> status.setDoImport(false));
}
// empty tagsets are not worth importing
tagsetDefinitionImportStatusList.stream().filter(status -> status.getTagset().isEmpty()).forEach(status -> status.setDoImport(false));
if (!annotationCollection.isEmpty()) {
project.importCollection(tagsetDefinitionImportStatusList, annotationCollection);
}
} catch (Exception e) {
// a broken collection is skipped; the rest of the corpus import continues
Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
String errorMsg = e.getMessage();
if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
errorMsg = "";
}
Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Collection will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
}
});
} else {
// NOTE(review): orElse(null) means documentMetadata can be null, which would NPE on the
// next line — confirm that every document entry always has a matching metadata record
final CorpusImportDocumentMetadata documentMetadata = documentMetadataList.stream().filter(metadata -> metadata.getSourceDocID().equals(idUri)).findFirst().orElse(null);
final Locale locale = LocaleUtils.toLocale(documentMetadata.getSourceDocLocale());
final boolean useApostrophe = Arrays.asList(documentMetadata.getSourceDocSepChars()).contains(String.valueOf(UploadFile.APOSTROPHE));
// fall back to the document id when no (usable) name is present in the metadata
final String title = (documentMetadata.getSourceDocName() == null || documentMetadata.getSourceDocName().isEmpty()) ? documentId : documentMetadata.getSourceDocName();
progressListener.setProgress("Importing Document %1$s", title);
final File tempFile = new File(new File(tempDir), documentId);
if (tempFile.exists()) {
tempFile.delete();
}
// spool the document content to disk before handing off to the UI-synchronized block
try (FileOutputStream fos = new FileOutputStream(tempFile)) {
IOUtils.copy(taIs, fos);
}
ui.accessSynchronously(() -> {
IDGenerator idGenerator = new IDGenerator();
IndexInfoSet indexInfoSet = new IndexInfoSet(Collections.emptyList(), useApostrophe ? Lists.newArrayList(UploadFile.APOSTROPHE) : Collections.emptyList(), locale);
TechInfoSet techInfoSet = new TechInfoSet(documentId, FileType.TEXT.getMimeType(), tempFile.toURI());
ContentInfoSet contentInfoSet = new ContentInfoSet(documentMetadata.getSourceDocAuthor(), documentMetadata.getSourceDocDescription(), documentMetadata.getSourceDocPublisher(), title);
// could use StandardCharsets.UTF_8 here; behavior is identical
techInfoSet.setCharset(Charset.forName("UTF-8"));
SourceDocumentInfo documentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);
// pick the content handler from the file extension; XML variants also carry intrinsic markup
AbstractSourceContentHandler handler = null;
boolean loadIntrinsicMarkup = false;
if (entryName.endsWith("xml2")) {
handler = new XML2ContentHandler();
loadIntrinsicMarkup = true;
} else if (entryName.endsWith("xml")) {
handler = new OldXMLContentHandler();
loadIntrinsicMarkup = true;
} else {
handler = new StandardContentHandler();
}
handler.setSourceDocumentInfo(documentInfo);
SourceDocument document = new SourceDocument(documentId, handler);
try {
project.insert(document, false);
if (loadIntrinsicMarkup) {
final TagManager tagmanager = new TagManager(new TagLibrary());
// NOTE(review): handler is cast to XML2ContentHandler; this is only safe if
// OldXMLContentHandler extends XML2ContentHandler — confirm
XmlMarkupCollectionSerializationHandler markupHandler = new XmlMarkupCollectionSerializationHandler(tagmanager, (XML2ContentHandler) handler, project.getUser().getIdentifier());
try (FileInputStream fis = new FileInputStream(tempFile)) {
AnnotationCollection intrinsicMarkupCollection = markupHandler.deserialize(document, idGenerator.generateCollectionId(), fis);
// classify every non-empty extracted tagset as either to-be-created or to-be-merged
Collection<TagsetImport> tagsetImports = new ArrayList<TagsetImport>();
String defaultIntrinsicXMLElmentsName = "Default Intrinsic XML Elements";
for (TagsetDefinition tagset : tagmanager.getTagLibrary()) {
if (!tagset.isEmpty()) {
TagsetDefinition targetTagset = project.getTagManager().getTagLibrary().getTagsetDefinition(tagset.getUuid());
boolean inProject = false;
if (targetTagset == null) {
targetTagset = tagset;
} else {
inProject = true;
}
String namespace = tagset.getName() == null ? "none" : tagset.getName();
if (tagset.getName() == null) {
tagset.setName(defaultIntrinsicXMLElmentsName);
}
TagsetImport tagsetImport = new TagsetImport(namespace, tagset, targetTagset, inProject ? TagsetImportState.WILL_BE_MERGED : TagsetImportState.WILL_BE_CREATED);
tagsetImports.add(tagsetImport);
}
}
// Creating Tagsets
tagsetImports.stream().filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_CREATED)).forEach(tagsetImport -> {
if (project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid()) != null) {
// already imported, so it will be a merge
tagsetImport.setImportState(TagsetImportState.WILL_BE_MERGED);
} else {
TagsetDefinition extractedTagset = tagsetImport.getExtractedTagset();
try {
project.importTagsets(Collections.singletonList(new TagsetDefinitionImportStatus(extractedTagset, project.inProjectHistory(extractedTagset.getUuid()), project.getTagManager().getTagLibrary().getTagsetDefinition(extractedTagset.getUuid()) != null)));
} catch (Exception e) {
// a failed tagset import is skipped; the remaining tagsets are still processed
Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, String.format("Error importing tagset %1$s with ID %2$s", extractedTagset.getName(), extractedTagset.getUuid()), e);
String errorMsg = e.getMessage();
if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
errorMsg = "";
}
Notification.show("Error", String.format("Error importing tagset %1$s! " + "This tagset will be skipped!\n The underlying error message was:\n%2$s", extractedTagset.getName(), errorMsg), Type.ERROR_MESSAGE);
}
}
});
// Merging Tagsets
tagsetImports.stream().filter(ti -> ti.getImportState().equals(TagsetImportState.WILL_BE_MERGED)).forEach(tagsetImport -> {
TagsetDefinition targetTagset = project.getTagManager().getTagLibrary().getTagsetDefinition(tagsetImport.getTargetTagset().getUuid());
for (TagDefinition tag : tagsetImport.getExtractedTagset()) {
// merge by tag name: reuse an existing tag (adding any missing property definitions)
// and re-point the incoming instances/references at it
Optional<TagDefinition> optionalTag = targetTagset.getTagDefinitionsByName(tag.getName()).findFirst();
if (optionalTag.isPresent()) {
TagDefinition existingTag = optionalTag.get();
tag.getUserDefinedPropertyDefinitions().forEach(pd -> {
if (existingTag.getPropertyDefinition(pd.getName()) == null) {
project.getTagManager().addUserDefinedPropertyDefinition(existingTag, new PropertyDefinition(pd));
}
});
List<TagReference> tagReferences = intrinsicMarkupCollection.getTagReferences(tag);
intrinsicMarkupCollection.removeTagReferences(tagReferences);
Multimap<TagInstance, TagReference> referencesByInstance = ArrayListMultimap.create();
tagReferences.forEach(tr -> referencesByInstance.put(tr.getTagInstance(), tr));
for (TagInstance incomingTagInstance : referencesByInstance.keySet()) {
// rebuild each instance against the existing tag's property definitions
TagInstance newTagInstance = new TagInstance(idGenerator.generate(), existingTag.getUuid(), incomingTagInstance.getAuthor(), incomingTagInstance.getTimestamp(), existingTag.getUserDefinedPropertyDefinitions(), targetTagset.getUuid());
for (Property oldProp : incomingTagInstance.getUserDefinedProperties()) {
String oldPropDefId = oldProp.getPropertyDefinitionId();
PropertyDefinition oldPropDef = tag.getPropertyDefinitionByUuid(oldPropDefId);
PropertyDefinition existingPropDef = existingTag.getPropertyDefinition(oldPropDef.getName());
newTagInstance.addUserDefinedProperty(new Property(existingPropDef.getUuid(), oldProp.getPropertyValueList()));
}
ArrayList<TagReference> newReferences = new ArrayList<>();
referencesByInstance.get(incomingTagInstance).forEach(tr -> {
try {
newReferences.add(new TagReference(newTagInstance, tr.getTarget().toString(), tr.getRange(), tr.getUserMarkupCollectionUuid()));
} catch (URISyntaxException e) {
// NOTE(review): the reference is silently dropped after printing the trace —
// confirm this best-effort behavior is intended
e.printStackTrace();
}
});
intrinsicMarkupCollection.addTagReferences(newReferences);
}
} else {
// unknown tag: move it into the target tagset as-is
tag.setTagsetDefinitionUuid(targetTagset.getUuid());
project.getTagManager().addTagDefinition(targetTagset, tag);
}
}
});
project.importCollection(Collections.emptyList(), intrinsicMarkupCollection);
}
if (tempFile.exists()) {
tempFile.delete();
}
}
} catch (Exception e) {
// a broken document is skipped; the rest of the corpus import continues
Logger.getLogger(ProjectView.class.getName()).log(Level.SEVERE, "Error importing the CATMA 5 Corpus: " + entryName, e);
String errorMsg = e.getMessage();
if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
errorMsg = "";
}
Notification.show("Error", String.format("Error importing the CATMA 5 Corpus! " + "This Document will be skipped!\n The underlying error message was:\n%1$s", errorMsg), Type.ERROR_MESSAGE);
}
});
}
entry = taIs.getNextTarEntry();
}
}
return null;
}
Use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.
The class TeiSourceDocumentInfoSerializationHandler, method deserialize.
/**
 * Builds a {@link SourceDocumentInfo} from the metadata carried by the given TEI document.
 *
 * @param teiDocument the TEI document whose info sets are extracted
 * @return a SourceDocumentInfo combining the document's index, content and tech info sets
 */
private SourceDocumentInfo deserialize(TeiDocument teiDocument) {
	// assemble directly from the accessors; constructor argument order is index, content, tech
	return new SourceDocumentInfo(
			teiDocument.getIndexInfoSet(),
			teiDocument.getContentInfoSet(),
			teiDocument.getTechInfoset()
	);
}
Use of de.catma.document.source.ContentInfoSet in project catma by forTEXT.
The class XmlMarkupCollectionSerializationHandler, method deserialize.
/**
 * Parses the given XML stream into an {@link AnnotationCollection} of intrinsic markup.
 * Each namespace declared on the root element is mapped to a tagset (created on demand),
 * a default tagset is ensured for un-namespaced elements, and the element tree is then
 * scanned into the collection.
 *
 * @param sourceDocument the document the markup belongs to
 * @param id the id for the new collection
 * @param inputStream the XML content to parse
 * @return the populated annotation collection
 * @throws IOException wrapping any parse or scan failure
 */
@Override
public AnnotationCollection deserialize(SourceDocument sourceDocument, String id, InputStream inputStream) throws IOException {
	try {
		Document document = new Builder().build(inputStream);

		// map each declared namespace prefix to a tagset id, registering missing tagsets
		Map<String, String> prefixToTagsetId = new HashMap<>();
		int declarationCount = document.getRootElement().getNamespaceDeclarationCount();
		for (int declIdx = 0; declIdx < declarationCount; declIdx++) {
			String nsPrefix = document.getRootElement().getNamespacePrefix(declIdx);
			String nsUri = document.getRootElement().getNamespaceURI(nsPrefix);
			if (nsUri == null || nsUri.isEmpty()) {
				continue; // no usable namespace for this declaration
			}
			String nsTagsetId = idGenerator.generateTagsetId(nsUri);
			if (tagManager.getTagLibrary().getTagsetDefinition(nsTagsetId) == null) {
				tagManager.addTagsetDefinition(new TagsetDefinition(nsTagsetId, nsUri, new Version()));
			}
			prefixToTagsetId.put(nsPrefix, nsTagsetId);
		}

		// ensure the default tagset for intrinsic XML elements exists
		String defaultIntrinsicXmlTagsetId = KnownTagsetDefinitionName.DEFAULT_INTRINSIC_XML.asTagsetId();
		if (tagManager.getTagLibrary().getTagsetDefinition(defaultIntrinsicXmlTagsetId) == null) {
			tagManager.addTagsetDefinition(new TagsetDefinition(defaultIntrinsicXmlTagsetId, null, new Version()));
		}

		StringBuilder contentBuilder = new StringBuilder();
		Stack<String> elementStack = new Stack<String>();
		AnnotationCollection markupCollection = new AnnotationCollection(id, new ContentInfoSet("", "Intrinsic Markup", "", DEFAULT_COLLECTION_TITLE), tagManager.getTagLibrary(), sourceDocument.getUuid(), sourceDocument.getRevisionHash());

		scanElements(contentBuilder, document.getRootElement(), elementStack, tagManager, tagManager.getTagLibrary(), prefixToTagsetId, markupCollection, sourceDocument.getUuid(), sourceDocument.getLength());

		return markupCollection;
	} catch (Exception e) {
		throw new IOException(e);
	}
}
Aggregations