use of org.corpus_tools.salt.common.SCorpus in project ANNIS by korpling.
the class QueryDaoImpl method exportCorpus.
/**
 * Exports a top-level corpus from the database as a Salt project on disk.
 *
 * <p>Every document is loaded with {@code retrieveAnnotationGraph}, the ANNIS-specific
 * feature labels are removed from its nodes and relations, and the document graph is
 * written as SaltXML file below {@code outputDirectory/toplevelCorpus}. The corpus
 * structure itself (one root corpus with a flat list of documents, including the
 * corpus and document meta annotations) is written as Salt project file directly
 * into {@code outputDirectory}.
 *
 * @param toplevelCorpus name of the top-level corpus that is exported
 * @param outputDirectory target directory; it is created when it does not exist yet,
 *        a failure to create it is only logged as warning
 */
@Override
@Transactional(readOnly = true)
public void exportCorpus(String toplevelCorpus, File outputDirectory) {
    // check if the corpus really exists
    mapCorpusNameToId(toplevelCorpus);

    SaltProject corpusProject = SaltFactory.createSaltProject();
    SCorpusGraph corpusGraph = SaltFactory.createSCorpusGraph();
    corpusGraph.setSaltProject(corpusProject);
    SCorpus rootCorpus = corpusGraph.createCorpus(null, toplevelCorpus);

    // add all root metadata
    for (Annotation metaAnno : listCorpusAnnotations(toplevelCorpus)) {
        rootCorpus.createMetaAnnotation(metaAnno.getNamespace(), metaAnno.getName(), metaAnno.getValue());
    }

    File documentRootDir = new File(outputDirectory, toplevelCorpus);
    if (!outputDirectory.exists()) {
        if (!outputDirectory.mkdirs()) {
            log.warn("Could not create output directory \"{}\" for exporting the corpus", outputDirectory.getAbsolutePath());
        }
    }

    List<Annotation> docs = listDocuments(toplevelCorpus);
    // 1-based position of the current document, only used for progress messages
    int i = 1;
    for (Annotation docAnno : docs) {
        log.info("Loading document {} from database ({}/{})", docAnno.getName(), i, docs.size());
        SaltProject docProject = retrieveAnnotationGraph(toplevelCorpus, docAnno.getName(), null);
        if (docProject != null && docProject.getCorpusGraphs() != null && !docProject.getCorpusGraphs().isEmpty()) {
            List<Annotation> docMetaData = listCorpusAnnotations(toplevelCorpus, docAnno.getName(), true);
            SCorpusGraph docCorpusGraph = docProject.getCorpusGraphs().get(0);
            // TODO: we could re-use the actual corpus structure instead of just adding a flat list of documents
            if (docCorpusGraph.getDocuments() != null) {
                for (SDocument doc : docCorpusGraph.getDocuments()) {
                    log.info("Removing SFeatures from {} ({}/{})", docAnno.getName(), i, docs.size());
                    SDocumentGraph graph = doc.getDocumentGraph();
                    if (graph != null) {
                        // remove all ANNIS specific features that require a special Java class
                        if (graph.getNodes() != null) {
                            for (SNode n : graph.getNodes()) {
                                n.removeLabel(AnnisConstants.ANNIS_NS, AnnisConstants.FEAT_RELANNIS_NODE);
                            }
                        }
                        if (graph.getRelations() != null) {
                            for (SRelation e : graph.getRelations()) {
                                e.removeLabel(AnnisConstants.ANNIS_NS, AnnisConstants.FEAT_RELANNIS_EDGE);
                            }
                        }
                        log.info("Saving document {} ({}/{})", doc.getName(), i, docs.size());
                        File documentFile = new File(documentRootDir, doc.getName() + "." + SaltUtil.FILE_ENDING_SALT_XML);
                        SaltUtil.saveDocumentGraph(graph, URI.createFileURI(documentFile.getAbsolutePath()));
                    } else {
                        // there is nothing to serialize, never hand a null graph to the writer
                        log.warn("Document {} has no document graph, only its metadata is exported", doc.getName());
                    }

                    // the exported corpus structure only references the document by name
                    SDocument docCopy = corpusGraph.createDocument(rootCorpus, doc.getName());
                    log.info("Adding metadata to document {} ({}/{})", doc.getName(), i, docs.size());
                    for (Annotation metaAnno : docMetaData) {
                        docCopy.createMetaAnnotation(metaAnno.getNamespace(), metaAnno.getName(), metaAnno.getValue());
                    }
                }
            }
        }
        i++;
    }
    // end for each document

    // save the actual SaltProject
    log.info("Saving corpus structure");
    File projectFile = new File(outputDirectory, SaltUtil.FILE_SALT_PROJECT);
    SaltXML10Writer writer = new SaltXML10Writer(projectFile);
    writer.writeSaltProject(corpusProject);
}
use of org.corpus_tools.salt.common.SCorpus in project ANNIS by korpling.
the class CommonHelper method getCorpusPath.
/**
 * Collects the names on the corpus path of a document.
 *
 * <p>The returned list starts with the name of the document itself. It is followed by
 * the name of every node that is reached by a bottom-up depth-first traversal of the
 * corpus graph which starts at the corpus the document belongs to.
 *
 * @param corpusGraph the corpus graph that contains the document
 * @param doc the document whose path is requested
 * @return the collected names, document name first
 */
public static List<String> getCorpusPath(SCorpusGraph corpusGraph, SDocument doc) {
    final List<String> pathNames = new LinkedList<>();
    pathNames.add(doc.getName());

    // records the name of each visited node and never prunes the traversal
    GraphTraverseHandler nameCollector = new GraphTraverseHandler() {
        @Override
        public void nodeReached(GRAPH_TRAVERSE_TYPE traversalType, String traversalId, SNode currNode, SRelation edge, SNode fromNode, long order) {
            pathNames.add(currNode.getName());
        }

        @Override
        public void nodeLeft(GRAPH_TRAVERSE_TYPE traversalType, String traversalId, SNode currNode, SRelation edge, SNode fromNode, long order) {
            // nothing to do when a node is left
        }

        @Override
        public boolean checkConstraint(GRAPH_TRAVERSE_TYPE traversalType, String traversalId, SRelation edge, SNode currNode, long order) {
            return true;
        }
    };

    // the traversal starts at the corpus that directly contains the document
    List<SNode> startNodes = new ArrayList<>();
    startNodes.add(corpusGraph.getCorpus(doc));
    corpusGraph.traverse(startNodes, GRAPH_TRAVERSE_TYPE.BOTTOM_UP_DEPTH_FIRST, "getRootCorpora", nameCollector);

    return pathNames;
}
use of org.corpus_tools.salt.common.SCorpus in project ANNIS by korpling.
the class SaltAnnotateExtractor method extractData.
/**
 * Maps the rows of a result set to a {@link SaltProject}.
 *
 * <p>Whenever the solution key reports a new key, a new corpus graph named
 * {@code "match_" + (index + matchstart)} with exactly one document is added to the
 * project. The corpus path of the row determines the top-level corpus, the optional
 * sub-corpora and the document name. All following rows add nodes and relations to
 * the document graph of the current match.
 *
 * <p>Any exception that occurs while reading the result set is logged and not
 * rethrown; in that case the project built so far is returned.
 *
 * @param resultSet the rows to convert; assumed to be sorted by key and pre-order
 * @return the project, possibly empty or incomplete when an error was logged
 */
@Override
public SaltProject extractData(ResultSet resultSet) throws SQLException, DataAccessException {
    SaltProject project = SaltFactory.createSaltProject();
    try {
        SCorpusGraph corpusGraph = null;
        SDocumentGraph graph = null;
        // fn: parent information (pre and component) id to node
        FastInverseMap<Long, SNode> nodeByRankID = new FastInverseMap<>();
        TreeSet<Long> allTextIDs = new TreeSet<>();
        TreeMap<Long, String> tokenTexts = new TreeMap<>();
        TreeMap<Long, SToken> tokenByIndex = new TreeMap<>();
        TreeMap<String, TreeMap<Long, String>> nodeBySegmentationPath = new TreeMap<>();
        Map<String, ComponentEntry> componentForSpan = new HashMap<>();
        // clear mapping functions for this graph
        // assumes that the result set is sorted by key, pre
        nodeByRankID.clear();
        SDocument document = null;
        AtomicInteger numberOfRelations = new AtomicInteger();
        int match_index = 0;
        SolutionKey<?> key = createSolutionKey();
        int counter = 0;
        while (resultSet.next()) {
            if (counter % 1000 == 0) {
                log.debug("handling resultset row {}", counter);
            }
            counter++;

            // read the key of the current row, it decides whether a new match starts here
            key.retrieveKey(resultSet);
            if (key.isNewKey()) {
                // create the text for the last graph
                if (graph != null && document != null) {
                    createMissingSpanningRelations(graph, nodeByRankID, tokenByIndex, componentForSpan, numberOfRelations);
                    createPrimaryTexts(graph, allTextIDs, tokenTexts, tokenByIndex);
                    addOrderingRelations(graph, nodeBySegmentationPath);
                }

                // new match, reset everything
                nodeByRankID.clear();
                tokenTexts.clear();
                tokenByIndex.clear();
                componentForSpan.clear();

                int matchstart = resultSet.getInt("matchstart");
                corpusGraph = SaltFactory.createSCorpusGraph();
                corpusGraph.setName("match_" + (match_index + matchstart));
                project.addCorpusGraph(corpusGraph);

                graph = SaltFactory.createSDocumentGraph();
                document = SaltFactory.createSDocument();
                document.setDocumentGraphLocation(org.eclipse.emf.common.util.URI.createFileURI(Files.createTempDir().getAbsolutePath()));

                List<String> path = corpusPathExtractor.extractCorpusPath(resultSet, "path");
                // validate before the first element is accessed, otherwise a too short
                // path fails late and leaves a half-built corpus structure behind
                Validate.isTrue(path.size() >= 2, "Corpus path must be have at least two members (toplevel and document)");

                SCorpus toplevelCorpus = SaltFactory.createSCorpus();
                toplevelCorpus.setName(path.get(0));
                corpusGraph.addNode(toplevelCorpus);

                // everything between the first and the last path element is a sub-corpus
                SCorpus corpus = toplevelCorpus;
                for (int i = 1; i < path.size() - 1; i++) {
                    SCorpus subcorpus = SaltFactory.createSCorpus();
                    subcorpus.setName(path.get(i));
                    corpusGraph.addSubCorpus(corpus, subcorpus);
                    corpus = subcorpus;
                }

                // the last path element is the document
                document.setName(path.get(path.size() - 1));
                document.setId("" + match_index);
                corpusGraph.addDocument(corpus, document);
                document.setDocumentGraph(graph);
                match_index++;
            }
            // end if new key

            // get node data
            SNode node = createOrFindNewNode(resultSet, graph, allTextIDs, tokenTexts, tokenByIndex, nodeBySegmentationPath, key, nodeByRankID);
            long rankID = longValue(resultSet, RANK_TABLE, "id");
            long componentID = longValue(resultSet, COMPONENT_TABLE, "id");
            // NOTE(review): wasNull() refers to the column that was read last, which is
            // presumably the component id fetched right above - do not reorder these reads
            if (!resultSet.wasNull()) {
                nodeByRankID.put(rankID, node);
                createRelation(resultSet, graph, nodeByRankID, node, numberOfRelations);
                if (node instanceof SSpan) {
                    componentForSpan.put(node.getId(), new ComponentEntry(componentID, 'c', stringValue(resultSet, COMPONENT_TABLE, "namespace"), stringValue(resultSet, COMPONENT_TABLE, "name")));
                }
            }
        }

        // the last match needs a primary text, too
        if (graph != null) {
            createMissingSpanningRelations(graph, nodeByRankID, tokenByIndex, componentForSpan, numberOfRelations);
            createPrimaryTexts(graph, allTextIDs, tokenTexts, tokenByIndex);
            addOrderingRelations(graph, nodeBySegmentationPath);
        }
    } catch (Exception ex) {
        // errors are only logged, the caller receives whatever has been built so far
        log.error("could not map result set to SaltProject", ex);
    }
    return project;
}
use of org.corpus_tools.salt.common.SCorpus in project ANNIS by korpling.
the class SaltProjectProvider method readFrom.
/**
 * Reads a {@link SaltProject} from an XML entity stream.
 *
 * <p>The stream is parsed with a SAX parser and a {@code MixedContentHandler}. For
 * each document graph the handler provides, a separate corpus graph is created whose
 * corpora and document are derived from the path segments of the first node (or of
 * the graph itself when it has no nodes). The corpus graph is only added to the
 * result when a document could be created.
 *
 * <p>The content comes from an HTTP entity, so the parser is configured to not
 * resolve external entities. Parser configuration and parse errors are logged and
 * not rethrown; the project built so far is returned in that case.
 *
 * @param entityStream the stream containing the serialized project, read as UTF-8
 * @return the parsed project, possibly empty when an error was logged
 * @throws IOException if reading from the stream fails
 */
@Override
public SaltProject readFrom(Class<SaltProject> type, Type genericType, Annotation[] annotations, MediaType mediaType, MultivaluedMap<String, String> httpHeaders, InputStream entityStream) throws IOException, WebApplicationException {
    SaltProject result = SaltFactory.createSaltProject();
    SAXParserFactory factory = SAXParserFactory.newInstance();
    MixedContentHandler handler = new MixedContentHandler();
    try {
        // harden the parser against XML external entity (XXE) attacks
        factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);

        SAXParser parser = factory.newSAXParser();
        XMLReader xmlReader = parser.getXMLReader();
        xmlReader.setContentHandler(handler);
        InputSource source = new InputSource(entityStream);
        source.setEncoding("UTF-8");
        xmlReader.parse(source);

        for (SDocumentGraph g : handler.getDocGraphs()) {
            // create a separate corpus graph for each document
            SCorpusGraph corpusGraph = SaltFactory.createSCorpusGraph();
            SCorpus parentCorpus = null;
            SDocument doc = null;

            List<SNode> nodes = g.getNodes();
            Iterator<String> it;
            if (nodes != null && !nodes.isEmpty()) {
                // the path of each node ID is always the document/corpus path
                it = nodes.get(0).getPath().segmentsList().iterator();
            } else {
                // Old salt versions had a separate ID for the document graph
                // which was the document name with the suffix "_graph".
                // Thus this method of getting the corpus path is only the fallback.
                it = g.getPath().segmentsList().iterator();
            }

            while (it.hasNext()) {
                String name = it.next();
                if (it.hasNext()) {
                    // this is a sub-corpus
                    parentCorpus = corpusGraph.createCorpus(parentCorpus, name);
                } else {
                    // no more path elements left, must be a document
                    doc = corpusGraph.createDocument(parentCorpus, name);
                }
            }

            if (doc != null) {
                result.addCorpusGraph(corpusGraph);
                doc.setDocumentGraph(g);
            }
        }
    } catch (ParserConfigurationException | SAXException ex) {
        log.error("Error when parsing XMI", ex);
    }
    return result;
}
Aggregations