use of de.catma.document.source.SourceDocumentInfo in project catma by forTEXT.
the class TeiSourceDocumentInfoSerializationHandler method serialize.
/**
 * Writes the metadata of the given document to the given stream as a TEI document.
 *
 * <p>An empty TEI document is created for the UUID of the source document, its header is
 * filled with the content, technical and index info sets taken from the document's
 * {@link SourceDocumentInfo}, and the result is serialized to {@code outputStream}.
 *
 * @param sourceDocument the document whose info sets are serialized
 * @param outputStream the stream that receives the serialized TEI document
 * @throws IOException if any step fails; an {@code IOException} raised by a step is
 *         propagated unchanged, every other failure is wrapped and kept as the cause
 */
public void serialize(SourceDocument sourceDocument, OutputStream outputStream) throws IOException {
    try {
        TeiDocumentFactory factory = new TeiDocumentFactory();
        TeiDocument teiDocument = factory.createEmptyDocument(sourceDocument.getUuid());

        SourceDocumentInfo sourceDocumentInfo =
                sourceDocument.getSourceContentHandler().getSourceDocumentInfo();
        teiDocument.getTeiHeader().setValues(
                sourceDocumentInfo.getContentInfoSet(),
                sourceDocumentInfo.getTechInfoSet(),
                sourceDocumentInfo.getIndexInfoSet());

        DocumentSerializer serializer = new DocumentSerializer();
        serializer.serialize(teiDocument.getDocument(), outputStream);
    } catch (Exception exc) {
        // An IOException already has the declared type: rethrow it as-is so its message
        // is not buried inside a second, message-less IOException.
        if (exc instanceof IOException) {
            throw (IOException) exc;
        }
        throw new IOException(exc);
    }
}
use of de.catma.document.source.SourceDocumentInfo in project catma by forTEXT.
the class ProjectView method addUploadFile.
/**
 * Builds a {@link SourceDocument} from an uploaded file and inserts it into the project.
 *
 * <p>The content of the document is loaded once to determine its {@code FileOSType} and a
 * CRC32 checksum, both of which are stored in the technical info set before the document
 * is inserted. If the upload carries an intrinsic markup collection, that collection is
 * imported as well. If a non-empty collection name pattern is given, a user markup
 * collection is created for the document, with <code>{{Title}}</code> in the pattern
 * replaced by the title of the upload.
 *
 * <p>An {@link IOException} is logged and reported to the user with a notification; it is
 * not propagated to the caller.
 *
 * @param uploadFile the uploaded file to add
 * @param useApostropheAsSeparator passed on to {@code UploadFile.getIndexInfoSet}
 * @param collectionNamePattern name pattern for the collection to create; {@code null} or
 *        empty means no collection is created
 */
private void addUploadFile(UploadFile uploadFile, boolean useApostropheAsSeparator, String collectionNamePattern) {
    SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo(
            uploadFile.getIndexInfoSet(useApostropheAsSeparator),
            uploadFile.getContentInfoSet(),
            uploadFile.getTechInfoSet());

    // XML uploads get the dedicated XML handler, every other mime type is handled by Tika.
    SourceContentHandler contentHandler =
            sourceDocumentInfo.getTechInfoSet().getMimeType().equals(FileType.XML2.getMimeType())
                    ? new XML2ContentHandler()
                    : new TikaContentHandler();
    contentHandler.setSourceDocumentInfo(sourceDocumentInfo);

    SourceDocument document = new SourceDocument(uploadFile.getUuid(), contentHandler);

    try {
        String content = document.getContent();

        FileOSType fileOSType = FileOSType.getFileOSType(content);
        sourceDocumentInfo.getTechInfoSet().setFileOSType(fileOSType);

        // Encode with a fixed charset: the no-arg getBytes() uses the platform default,
        // which would make the checksum of the same content differ between JVMs.
        CRC32 checksum = new CRC32();
        checksum.update(content.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        sourceDocumentInfo.getTechInfoSet().setChecksum(checksum.getValue());

        project.insert(document);

        AnnotationCollection intrinsicMarkupCollection = uploadFile.getIntrinsicMarkupCollection();
        if (intrinsicMarkupCollection != null) {
            project.importCollection(Collections.emptyList(), intrinsicMarkupCollection);
        }

        if (collectionNamePattern != null && !collectionNamePattern.isEmpty()) {
            String collectionName = collectionNamePattern.replace("{{Title}}", uploadFile.getTitle());
            project.createUserMarkupCollection(collectionName, document);
        }
    } catch (IOException e) {
        Logger.getLogger(ProjectView.class.getName()).log(
                Level.SEVERE,
                String.format("Error loading content of %1$s", uploadFile.getTempFilename()),
                e);

        // Never show "null" to the user when the exception carries no message.
        String errorMsg = e.getMessage();
        if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
            errorMsg = "";
        }

        Notification.show(
                "Error",
                String.format(
                        "Error loading content of %1$s! " + "This document will be skipped!\n The underlying error message was:\n%2$s",
                        uploadFile.getTitle(),
                        errorMsg),
                Type.ERROR_MESSAGE);
    }
}
use of de.catma.document.source.SourceDocumentInfo in project catma by forTEXT.
the class ImportIntrinsicMarkupStep method enter.
/**
 * Inspects the intrinsic markup of all uploaded XML files when this step is entered.
 *
 * <p>Nothing happens when the step is entered by going back. Otherwise the content panel
 * is disabled and an indeterminate progress bar is shown while a background job
 * deserializes an annotation collection from each upload whose mime type is the XML2 mime
 * type and stores it on the upload. When the job is done, the file list and the list of
 * tagset imports are rebuilt, the tagset imports are put into the wizard context and the
 * step change listener (if any) is notified.
 *
 * <p>In both the success and the failure case the content panel is re-enabled and the
 * progress bar is hidden again, so that a failed inspection does not leave the step
 * in a permanent "busy" state.
 *
 * @param back {@code true} if the step is entered by navigating backwards
 */
@Override
public void enter(boolean back) {
    if (back) {
        return;
    }

    contentPanel.setEnabled(false);
    progressBar.setVisible(true);
    progressBar.setIndeterminate(true);

    // Only XML uploads can carry intrinsic markup.
    @SuppressWarnings("unchecked")
    final ArrayList<UploadFile> files = new ArrayList<UploadFile>(
            ((Collection<UploadFile>) wizardContext.get(DocumentWizard.WizardContextKey.UPLOAD_FILE_LIST))
                    .stream()
                    .filter(uploadFile -> uploadFile.getMimetype().equals(FileType.XML2.getMimeType()))
                    .collect(Collectors.toList()));

    // Scratch tag manager that collects the tagsets found in the inspected files.
    final TagManager tagmanager = new TagManager(new TagLibrary());

    BackgroundServiceProvider backgroundServiceProvider = (BackgroundServiceProvider) UI.getCurrent();
    backgroundServiceProvider.submit("inspecting-intrinsic-markup", new DefaultProgressCallable<List<UploadFile>>() {
        @Override
        public List<UploadFile> call() throws Exception {
            IDGenerator idGenerator = new IDGenerator();
            for (UploadFile uploadFile : files) {
                XML2ContentHandler contentHandler = new XML2ContentHandler();
                SourceDocument doc = new SourceDocument(uploadFile.getUuid(), contentHandler);

                // Minimal document info: only the URI of the temporary upload file is set.
                SourceDocumentInfo documentInfo = new SourceDocumentInfo();
                TechInfoSet techInfoSet = new TechInfoSet();
                techInfoSet.setURI(uploadFile.getTempFilename());
                documentInfo.setTechInfoSet(techInfoSet);
                contentHandler.setSourceDocumentInfo(documentInfo);

                XmlMarkupCollectionSerializationHandler handler = new XmlMarkupCollectionSerializationHandler(
                        tagmanager, contentHandler, project.getUser().getIdentifier());

                try (FileInputStream fis = new FileInputStream(new File(uploadFile.getTempFilename()))) {
                    AnnotationCollection collection =
                            handler.deserialize(doc, idGenerator.generateCollectionId(), fis);
                    uploadFile.setIntrinsicMarkupCollection(collection);
                }
            }
            return files;
        }
    }, new ExecutionListener<List<UploadFile>>() {
        @Override
        public void done(List<UploadFile> result) {
            contentPanel.setEnabled(true);
            progressBar.setVisible(false);
            progressBar.setIndeterminate(false);

            fileList.clear();
            fileList.addAll(result);
            fileDataProvider.refreshAll();

            tagsetImportList.clear();
            String defaultIntrinsicXmlElementsName = "Default Intrinsic XML Elements";

            for (TagsetDefinition tagset : tagmanager.getTagLibrary()) {
                if (!tagset.isEmpty()) {
                    // A tagset with the same UUID in the project becomes the merge target.
                    TagsetDefinition targetTagset =
                            project.getTagManager().getTagLibrary().getTagsetDefinition(tagset.getUuid());
                    boolean inProject = false;
                    if (targetTagset == null) {
                        targetTagset = tagset;
                    } else {
                        inProject = true;
                    }

                    // The namespace is derived before the default name is assigned, so an
                    // unnamed tagset is listed under "none" but gets the default name.
                    String namespace = tagset.getName() == null ? "none" : tagset.getName();
                    if (tagset.getName() == null) {
                        tagset.setName(defaultIntrinsicXmlElementsName);
                    }

                    TagsetImport tagsetImport = new TagsetImport(
                            namespace,
                            tagset,
                            targetTagset,
                            inProject ? TagsetImportState.WILL_BE_MERGED : TagsetImportState.WILL_BE_CREATED);
                    tagsetImportList.add(tagsetImport);
                }
            }

            tagsetDataProvider.refreshAll();
            wizardContext.put(DocumentWizard.WizardContextKey.TAGSET_IMPORT_LIST, tagsetImportList);

            if (stepChangeListener != null) {
                stepChangeListener.stepChanged(ImportIntrinsicMarkupStep.this);
            }
        }

        @Override
        public void error(Throwable t) {
            // Undo the "busy" state set before the job was submitted; otherwise the panel
            // stays disabled and the progress bar keeps spinning after a failure.
            contentPanel.setEnabled(true);
            progressBar.setVisible(false);
            progressBar.setIndeterminate(false);

            Logger.getLogger(ImportIntrinsicMarkupStep.class.getName()).log(
                    Level.SEVERE, "Error inspecting files", t);

            // Never show "null" to the user when the throwable carries no message.
            String errorMsg = t.getMessage();
            if ((errorMsg == null) || (errorMsg.trim().isEmpty())) {
                errorMsg = "";
            }

            Notification.show(
                    "Error",
                    String.format(
                            "Error inspecting the contents! " + "\n The underlying error message was:\n%1$s",
                            errorMsg),
                    Type.ERROR_MESSAGE);
        }
    });
}
use of de.catma.document.source.SourceDocumentInfo in project catma by forTEXT.
the class GraphWriter method addDocument.
/**
 * Adds a vertex for the given document to the graph, links it to the project revision
 * vertex and then adds the document's terms and token positions.
 *
 * <p>The tokens are read as JSON from the tokenized source document file: a map from term
 * literal to a list of position entries with {@code startOffset}, {@code endOffset} and
 * {@code tokenOffset}. Each term becomes a vertex that is part of the document, each
 * position becomes a vertex attached to its term, and positions with consecutive token
 * offsets are connected by an adjacency edge.
 *
 * <p>Any exception raised while loading or adding the tokens is logged and not rethrown;
 * the document vertex and its edge to the project revision exist at that point already.
 *
 * @param projectRevV the project revision vertex the document is attached to
 * @param document the document to add
 * @throws Exception declared by the signature; failures of the token loading part are
 *         caught and logged inside the method
 */
void addDocument(Vertex projectRevV, SourceDocument document) throws Exception {
    logger.info("Starting to add Document " + document + " to the graph");

    Vertex docVertex = graph.addVertex(nt(SourceDocument));

    SourceDocumentInfo documentInfo = document.getSourceContentHandler().getSourceDocumentInfo();
    documentInfo.getTechInfoSet().setURI(fileInfoProvider.getSourceDocumentFileURI(document.getUuid()));

    // The individual content/tech/index info values are not stored as separate vertex
    // properties; only the ID and the document object itself are attached.
    docVertex.property("documentId", document.getUuid());
    docVertex.property("document", document);

    projectRevV.addEdge(rt(hasDocument), docVertex);

    try {
        Path tokenizedFilePath = fileInfoProvider.getTokenizedSourceDocumentPath(document.getUuid());
        String tokensJson = FileUtils.readFileToString(tokenizedFilePath.toFile(), "UTF-8");

        @SuppressWarnings("rawtypes")
        Map termsToPositions = new Gson().fromJson(tokensJson, Map.class);

        // Position vertices keyed by token offset, used to chain neighbours afterwards.
        Map<Integer, Vertex> positionsByTokenOffset = new HashMap<>();

        for (Object rawTermEntry : termsToPositions.entrySet()) {
            Map.Entry termEntry = (Map.Entry) rawTermEntry;

            String literal = (String) termEntry.getKey();
            Vertex termVertex = graph.addVertex(nt(Term));
            termVertex.property("literal", literal);

            List positions = (List) termEntry.getValue();
            termVertex.property("freq", positions.size());
            termVertex.addEdge(rt(isPartOf), docVertex);

            for (Object rawPosition : positions) {
                Map position = (Map) rawPosition;

                // The untyped JSON map holds the numeric values as Double.
                int startOffset = ((Double) position.get("startOffset")).intValue();
                int endOffset = ((Double) position.get("endOffset")).intValue();
                int tokenOffset = ((Double) position.get("tokenOffset")).intValue();

                Vertex positionVertex = graph.addVertex(nt(Position));
                positionVertex.property("startOffset", startOffset);
                positionVertex.property("endOffset", endOffset);
                positionVertex.property("tokenOffset", tokenOffset);

                termVertex.addEdge(rt(hasPosition), positionVertex);
                positionsByTokenOffset.put(tokenOffset, positionVertex);
            }
        }

        // Connect each position with the one at the next token offset.
        int lastTokenOffset = positionsByTokenOffset.size() - 1;
        for (int offset = 0; offset < lastTokenOffset; offset++) {
            Vertex current = positionsByTokenOffset.get(offset);
            current.addEdge(rt(isAdjacentTo), positionsByTokenOffset.get(offset + 1));
        }

        logger.info("Finished adding Document " + document + " to the graph");
    } catch (Exception e) {
        String message = String.format(
                "error loading tokens for Document %1$s in project %2$s",
                document.getUuid(),
                projectReference.getProjectId());
        logger.log(Level.SEVERE, message, e);
    }
}
use of de.catma.document.source.SourceDocumentInfo in project catma by forTEXT.
the class GitProjectHandlerTest method createSourceDocument.
// @Test
// public void delete() throws Exception {
// try (ILocalGitRepositoryManager jGitRepoManager = new JGitRepoManager(this.catmaProperties.getProperty(CATMAPropertyKey.GitBasedRepositoryBasePath.name()), this.catmaUser)) {
// this.directoriesToDeleteOnTearDown.add(jGitRepoManager.getRepositoryBasePath());
//
// GitProjectManager gitProjectHandler = new GitProjectManager(
// CATMAPropertyKey.GitBasedRepositoryBasePath.getValue(),
// UserIdentification.userToMap(this.catmaUser.getIdentifier()));
//
//
// String projectId = gitProjectHandler.create(
// "Test CATMA Project", "This is a test CATMA project"
// );
// // we don't add the projectId to this.projectsToDeleteOnTearDown as this is the delete test
//
// assertNotNull(projectId);
// assert projectId.startsWith("CATMA_");
//
// // the JGitRepoManager instance should always be in a detached state after GitProjectHandler calls
// // return
// assertFalse(jGitRepoManager.isAttached());
//
// String expectedRootRepositoryName = GitProjectManager.getProjectRootRepositoryName(projectId);
//
// File expectedRootRepositoryPath = new File(
// jGitRepoManager.getRepositoryBasePath(), expectedRootRepositoryName
// );
//
// assert expectedRootRepositoryPath.exists();
// assert expectedRootRepositoryPath.isDirectory();
//
// gitProjectHandler.delete(projectId);
//
// assertFalse(expectedRootRepositoryPath.exists());
//
// // the JGitRepoManager instance should always be in a detached state after GitProjectHandler calls
// // return
// assertFalse(jGitRepoManager.isAttached());
// }
// }
//
// @Test
// public void createTagset() throws Exception {
// try (JGitRepoManager jGitRepoManager = new JGitRepoManager(this.catmaProperties.getProperty(CATMAPropertyKey.GitBasedRepositoryBasePath.name()), this.catmaUser)) {
// this.directoriesToDeleteOnTearDown.add(jGitRepoManager.getRepositoryBasePath());
//
// GitProjectManager gitProjectManager = new GitProjectManager(
// CATMAPropertyKey.GitBasedRepositoryBasePath.getValue(),
// UserIdentification.userToMap(this.catmaUser.getIdentifier()));
//
//
// String projectId = gitProjectManager.create(
// "Test CATMA Project",
// "This is a test CATMA project"
// );
// this.projectsToDeleteOnTearDown.add(projectId);
//
// // the JGitRepoManager instance should always be in a detached state after GitProjectHandler calls return
// assertFalse(jGitRepoManager.isAttached());
//
// GitProjectHandler gitProjectHandler = new GitProjectHandler(null, projectId, jGitRepoManager, gitLabServerManager);
//
// String tagsetId = gitProjectHandler.createTagset(
//
// null,
// "Test Tagset",
// null
// );
//
// assertNotNull(tagsetId);
//
// // the JGitRepoManager instance should always be in a detached state after GitProjectHandler calls return
// assertFalse(jGitRepoManager.isAttached());
//
// jGitRepoManager.open(projectId, GitProjectManager.getProjectRootRepositoryName(projectId));
// Status status = jGitRepoManager.getGitApi().status().call();
// Set<String> added = status.getAdded();
//
// assert status.hasUncommittedChanges();
// assert added.contains(".gitmodules");
// assert added.contains(String.format("%s/%s", GitProjectHandler.TAGSET_SUBMODULES_DIRECTORY_NAME, tagsetId));
// }
// }
//
// @Test
// public void createMarkupCollection() throws Exception {
// try (JGitRepoManager jGitRepoManager = new JGitRepoManager(this.catmaProperties.getProperty(CATMAPropertyKey.GitBasedRepositoryBasePath.name()), this.catmaUser)) {
// this.directoriesToDeleteOnTearDown.add(jGitRepoManager.getRepositoryBasePath());
//
// GitProjectManager gitProjectManager = new GitProjectManager(
// CATMAPropertyKey.GitBasedRepositoryBasePath.getValue(),
// UserIdentification.userToMap(this.catmaUser.getIdentifier()));
//
//
// String projectId = gitProjectManager.create(
// "Test CATMA Project",
// "This is a test CATMA project"
// );
// this.projectsToDeleteOnTearDown.add(projectId);
//
// // the JGitRepoManager instance should always be in a detached state after GitProjectHandler calls return
// assertFalse(jGitRepoManager.isAttached());
//
// GitProjectHandler gitProjectHandler = new GitProjectHandler(null, projectId, jGitRepoManager, gitLabServerManager);
//
// String markupCollectionId = gitProjectHandler.createMarkupCollection(
// null,
// "Test Markup Collection",
// null,
// "fakeSourceDocumentId",
// "fakeSourceDocumentVersion"
// );
//
// assertNotNull(markupCollectionId);
//
// // the JGitRepoManager instance should always be in a detached state after GitProjectHandler calls return
// assertFalse(jGitRepoManager.isAttached());
//
// jGitRepoManager.open(projectId, GitProjectManager.getProjectRootRepositoryName(projectId));
// Status status = jGitRepoManager.getGitApi().status().call();
// Set<String> added = status.getAdded();
//
// assert status.hasUncommittedChanges();
// assert added.contains(".gitmodules");
// assert added.contains(
// String.format(
// "%s/%s", GitProjectHandler.MARKUP_COLLECTION_SUBMODULES_DIRECTORY_NAME, markupCollectionId
// )
// );
// }
// }
/**
 * Creates a source document in a freshly created project through
 * {@code GitProjectHandler.createSourceDocument} and verifies the resulting repository
 * state: a revision hash is returned, the repository manager is detached afterwards, the
 * root repository is clean and contains exactly one commit, made by the restricted test
 * user, whose message mentions the title of the added document.
 *
 * <p>All file streams are opened in try-with-resources blocks so that they are closed
 * even when an assertion fails.
 */
@Test
public void createSourceDocument() throws Exception {
    File originalSourceDocument = new File("testdocs/rose_for_emily.pdf");
    File convertedSourceDocument = new File("testdocs/rose_for_emily.txt");

    IndexInfoSet indexInfoSet = new IndexInfoSet();
    indexInfoSet.setLocale(Locale.ENGLISH);
    ContentInfoSet contentInfoSet = new ContentInfoSet("William Faulkner", "", "", "A Rose for Emily");
    TechInfoSet techInfoSet = new TechInfoSet(FileType.TEXT, StandardCharsets.UTF_8, FileOSType.DOS, 705211438L);
    SourceDocumentInfo sourceDocumentInfo = new SourceDocumentInfo(indexInfoSet, contentInfoSet, techInfoSet);

    // The term extraction consumes its own stream; a fresh one is opened below for the
    // actual write, otherwise an empty file would be written (FileInputStream does not
    // support `reset`).
    Map<String, List<TermInfo>> terms;
    try (FileInputStream termExtractionStream = new FileInputStream(convertedSourceDocument)) {
        terms = new TermExtractor(
                IOUtils.toString(termExtractionStream, techInfoSet.getCharset()),
                new ArrayList<>(),
                new ArrayList<>(),
                indexInfoSet.getLocale()).getTerms();
    }

    String sourceDocumentUuid = new IDGenerator().generateDocumentId();
    // GraphWorktreeProject.TOKENIZED_FILE_EXTENSION
    String tokenizedSourceDocumentFileName = sourceDocumentUuid + "." + "json";

    try (
            FileInputStream originalSourceDocumentStream = new FileInputStream(originalSourceDocument);
            FileInputStream convertedSourceDocumentStream = new FileInputStream(convertedSourceDocument);
            JGitRepoManager jGitRepoManager = new JGitRepoManager(
                    CATMAPropertyKey.GitBasedRepositoryBasePath.getValue(), gitlabManagerRestricted.getUser())
    ) {
        directoriesToDeleteOnTearDown.add(jGitRepoManager.getRepositoryBasePath());

        BackgroundService mockBackgroundService = mock(BackgroundService.class);
        EventBus mockEventBus = mock(EventBus.class);

        GitProjectManager gitProjectManager = new GitProjectManager(
                CATMAPropertyKey.GitBasedRepositoryBasePath.getValue(),
                gitlabManagerRestricted,
                // noop deletion handler
                (projectId) -> {
                },
                mockBackgroundService,
                mockEventBus);

        String projectId = gitProjectManager.create("Test CATMA Project", "This is a test CATMA project");
        // we don't add the projectId to projectsToDeleteOnTearDown as deletion of the user will take care of that for us

        // the JGitRepoManager instance should always be in a detached state after GitProjectManager calls return
        assertFalse(jGitRepoManager.isAttached());

        GitProjectHandler gitProjectHandler = new GitProjectHandler(
                gitlabManagerRestricted.getUser(), projectId, jGitRepoManager, gitlabManagerRestricted);
        // would usually happen when the project is opened via GraphWorktreeProject
        gitProjectHandler.loadRolesPerResource();

        String revisionHash = gitProjectHandler.createSourceDocument(
                sourceDocumentUuid,
                originalSourceDocumentStream,
                originalSourceDocument.getName(),
                convertedSourceDocumentStream,
                convertedSourceDocument.getName(),
                terms,
                tokenizedSourceDocumentFileName,
                sourceDocumentInfo);
        assertNotNull(revisionHash);

        // the JGitRepoManager instance should always be in a detached state after GitProjectHandler calls return
        assertFalse(jGitRepoManager.isAttached());

        jGitRepoManager.open(projectId, GitProjectManager.getProjectRootRepositoryName(projectId));

        Status status = jGitRepoManager.getGitApi().status().call();
        assert status.isClean();
        assertFalse(status.hasUncommittedChanges());

        Iterable<RevCommit> commits = jGitRepoManager.getGitApi().log().all().call();
        @SuppressWarnings("unchecked")
        List<RevCommit> commitsList = IteratorUtils.toList(commits.iterator());
        assertEquals(1, commitsList.size());

        // TODO: it would be good to check that the revision hash of the commit matches, however GitProjectHandler currently returns the revision hash
        // from the source document repo itself rather than from the root repo
        assertEquals(gitlabManagerRestricted.getUser().getIdentifier(), commitsList.get(0).getCommitterIdent().getName());
        assertEquals(gitlabManagerRestricted.getUser().getEmail(), commitsList.get(0).getCommitterIdent().getEmailAddress());
        assert commitsList.get(0).getFullMessage().contains(String.format("Added Document %s with ID", contentInfoSet.getTitle()));
        // TODO: add assertions for actual paths changed (see commented above - would need to be modified for already committed changes)
    }
}
Aggregations