use of de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus in project webanno by webanno.
the class AutomationUtil method generateTrainDocument.
/**
 * Writes the MIRA training file for the train feature of the given template.
 * <p>
 * The file is only (re)generated when at least one of the following holds: an unprocessed
 * training document exists for one of the template's other features or for the train feature
 * itself, the project has at least one source document, or an unprocessed tab-separated
 * document exists for the train feature. Otherwise the method returns without touching any
 * file.
 * <p>
 * The output is assembled from three sources, in this order: non-TAB-SEP training documents
 * of the train feature, source documents in state {@code CURATION_FINISHED}, and
 * tab-separated documents of the train feature.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aRepository
 *            used to list the source documents of the project.
 * @param aCurationDocumentService
 *            used to read the curation CAS of finished source documents.
 * @param aAnnotationService
 *            used to obtain the type adapter of the train feature's layer.
 * @param aAutomationService
 *            used to locate the MIRA directory, list training documents and read their CASes.
 * @param aUserDao
 *            not used by this method.
 * @param aBase
 *            if {@code true}, lines are generated without the train feature (and without the
 *            second column of tab-separated documents); if {@code false}, the train feature is
 *            included and the automation status / processed flags are updated.
 * @throws IOException
 *             if the MIRA directory or the training file cannot be written, or a
 *             tab-separated document cannot be read.
 * @throws UIMAException
 *             declared for the CAS-reading calls.
 * @throws ClassNotFoundException
 *             declared for the CAS-reading calls.
 * @throws AutomationException
 *             if a non-blank line of a tab-separated document does not have exactly two
 *             columns.
 */
public static void generateTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    LOG.info("Starting to generate training document");
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }

    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;

    // A. Training documents for one of the other train layers were changed
    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument document : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(otherFeature)) {
                documentChanged = true;
                break;
            }
        }
    }

    // B. Training documents for the main training layer were changed
    for (TrainingDocument document : aAutomationService.listTrainingDocuments(feature.getProject())) {
        if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }

    // C. The project has source documents (treated as "a new curation document may exist")
    if (aRepository.listSourceDocuments(feature.getProject()).size() > 0) {
        documentChanged = true;
    }

    // D. Tab-separated training documents for the main training layer were changed
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }

    if (!documentChanged) {
        return;
    }

    // NOTE(review): base mode writes "*.train.ft" and non-base mode writes "*.train.base";
    // the suffixes look swapped but are kept because other code may rely on these names.
    File trainFile;
    if (aBase) {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }

    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);

    // try-with-resources guarantees the writer is closed even when reading a CAS or a
    // tab-separated file fails half-way through.
    try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
        AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());

        // Training documents (curated or WebAnno-compatible imported ones - read using UIMA)
        List<TrainingDocument> trainingDocuments = aAutomationService.listTrainingDocuments(feature.getProject());
        for (TrainingDocument trainingDocument : trainingDocuments) {
            if ((trainingDocument.getFeature() != null && trainingDocument.getFeature().equals(feature)) && !trainingDocument.getFormat().equals(WebAnnoConst.TAB_SEP)) {
                JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                trainingDocument.setProcessed(!aBase);
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }

        // Curated source documents
        int sourceDocsCount = 0;
        List<SourceDocument> sourceDocuments = aRepository.listSourceDocuments(feature.getProject());
        for (SourceDocument sourceDocument : sourceDocuments) {
            if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
                JCas jCas = aCurationDocumentService.readCurationCas(sourceDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
            sourceDocsCount++;
            // Progress is reported against the list actually being iterated.
            LOG.info("Processed source document " + sourceDocsCount + " of " + sourceDocuments.size());
        }

        // Tab-separated documents to be used as a target layer train document
        int goldStandardDocsCounter = 0;
        List<TrainingDocument> goldStandardDocs = aAutomationService.listTabSepDocuments(feature.getProject());
        for (TrainingDocument document : goldStandardDocs) {
            if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null && document.getFeature().equals(feature)) {
                File tabSepFile = new File(aAutomationService.getDocumentFolder(document), document.getName());
                // The reader is closed here; the LineIterator only wraps it.
                try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                    LineIterator it = IOUtils.lineIterator(tabSepReader);
                    while (it.hasNext()) {
                        String line = it.next();
                        if (line.trim().equals("")) {
                            // blank line = sentence boundary
                            trainOut.append("\n");
                        } else {
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            if (st.countTokens() != 2) {
                                throw new AutomationException("This is not a valid TAB-SEP document");
                            }
                            if (aBase) {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), ""));
                            } else {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                            }
                        }
                    }
                }
            }
            goldStandardDocsCounter++;
            LOG.info("Processed gold standard document " + goldStandardDocsCounter + " of " + goldStandardDocs.size());
        }
    }
    LOG.info("Completed generating training document");
}
use of de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus in project webanno by webanno.
the class AutomationUtil method predict.
/**
 * Runs MIRA prediction for every source document of the train feature's project and passes
 * the predicted tags to {@code automate}, then writes the resulting CAS as correction CAS.
 * <p>
 * For each source document the input is read from {@code <documentId>.pred} in the MIRA
 * directory, the raw MIRA output is written to {@code <documentId>.pred-pred}, and the last
 * space-separated token of every non-blank output line is taken as the predicted tag.
 *
 * @param aTemplate
 *            the template providing the train feature.
 * @param aRepository
 *            used to list source documents and to read the current user's annotation CAS.
 * @param aCorrectionDocumentService
 *            used to write the correction CAS.
 * @param aAutomationService
 *            used to locate the MIRA directory, the model file and the automation status.
 * @param aUserDao
 *            used to obtain the current user.
 * @throws UIMAException
 *             declared for the CAS read/write calls.
 * @throws ClassNotFoundException
 *             declared for the CAS read calls.
 * @throws IOException
 *             if a prediction input or output file cannot be read or written.
 * @throws AnnotationException
 *             declared for the {@code automate} call.
 */
public static void predict(MiraTemplate aTemplate, DocumentService aRepository, CorrectionDocumentService aCorrectionDocumentService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException {
    AnnotationFeature layerFeature = aTemplate.getTrainFeature();
    File miraDir = aAutomationService.getMiraDir(layerFeature);
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);

    for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) {
        File predFile = new File(miraDir, document.getId() + ".pred");
        Mira mira = new Mira();
        int shiftColumns = 0;
        int nbest = 1;
        int beamSize = 0;
        boolean maxPosteriors = false;
        String modelName = aAutomationService.getMiraModel(layerFeature, false, null).getAbsolutePath();
        String testName = predFile.getAbsolutePath();
        File predcitedFile = new File(predFile.getAbsolutePath() + "-pred");

        // Both streams are closed before the output file is read back. The input always comes
        // from the ".pred" file: File.getAbsolutePath() never returns null, so no System.in
        // fallback is needed.
        try (PrintStream stream = new PrintStream(predcitedFile);
                BufferedReader input = new BufferedReader(new FileReader(testName))) {
            mira.loadModel(modelName);
            mira.setShiftColumns(shiftColumns);
            mira.nbest = nbest;
            mira.beamSize = beamSize;
            mira.maxPosteriors = maxPosteriors;
            mira.test(input, stream);
        }
        LOG.info("Prediction is wrtten to a MIRA File. To be done is writing back to the CAS");

        // Collect the last column of every non-blank prediction line.
        List<String> annotations = new ArrayList<>();
        try (FileReader predictionReader = new FileReader(predcitedFile)) {
            LineIterator it = IOUtils.lineIterator(predictionReader);
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().equals("")) {
                    continue;
                }
                StringTokenizer st = new StringTokenizer(line, " ");
                String tag = "";
                while (st.hasMoreTokens()) {
                    tag = st.nextToken();
                }
                annotations.add(tag);
            }
        }
        LOG.info(annotations.size() + " Predictions found to be written to the CAS");

        JCas jCas = null;
        User user = aUserDao.getCurrentUser();
        // NOTE(review): the statements after this try/catch run on both paths, so automate()
        // is invoked twice when the try block succeeds, and the write + status decrement
        // happen twice when the handler runs. In the handler jCas is still null unless the
        // exception came from automate() itself. The intended fallback CAS is not visible
        // here, so the original control flow is kept unchanged - confirm and simplify.
        try {
            AnnotationDocument annoDocument = aRepository.getAnnotationDocument(document, user);
            jCas = aRepository.readAnnotationCas(annoDocument);
            automate(jCas, layerFeature, annotations);
        } catch (DataRetrievalFailureException e) {
            automate(jCas, layerFeature, annotations);
            LOG.info("Predictions found are written to the CAS");
            aCorrectionDocumentService.writeCorrectionCas(jCas, document);
            status.setAnnoDocs(status.getAnnoDocs() - 1);
        }
        automate(jCas, layerFeature, annotations);
        LOG.info("Predictions found are written to the CAS");
        aCorrectionDocumentService.writeCorrectionCas(jCas, document);
        status.setAnnoDocs(status.getAnnoDocs() - 1);
    }
}
use of de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus in project webanno by webanno.
the class AutomationUtil method addTabSepTrainDocument.
/**
 * Converts every tab-separated document that has no feature assigned into its own MIRA
 * {@code .train} file inside the MIRA directory of the template's train feature.
 * <p>
 * Nothing is written unless at least one tab-separated document of the project is still
 * unprocessed. Tab-separated documents that carry a feature are skipped, because they are
 * target layer train documents.
 *
 * @param aTemplate
 *            the template providing the train feature (and through it the project).
 * @param aAutomationService
 *            used to locate the MIRA directory, list tab-separated documents, resolve their
 *            folders and obtain the automation status.
 * @throws IOException
 *             if the MIRA directory cannot be created or a file cannot be read or written.
 * @throws UIMAException
 *             declared by the signature; not raised by the visible code.
 * @throws ClassNotFoundException
 *             declared by the signature; not raised by the visible code.
 * @throws AutomationException
 *             if a non-blank line does not consist of exactly two tab-separated columns.
 */
public static void addTabSepTrainDocument(MiraTemplate aTemplate, AutomationService aAutomationService) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }

    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);

    boolean documentChanged = false;
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed()) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return;
    }

    for (TrainingDocument trainingDocument : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (trainingDocument.getFeature() != null) {
            // This is a target layer train document
            continue;
        }

        File trainFile = new File(miraDir, trainingDocument.getId() + trainingDocument.getProject().getId() + ".train");
        // The writer is opened first, as before, so the train file is created/truncated
        // before the source file is resolved and opened. Both are closed on every exit path.
        try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
            File tabSepFile = new File(aAutomationService.getDocumentFolder(trainingDocument), trainingDocument.getName());
            try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                LineIterator it = IOUtils.lineIterator(tabSepReader);
                while (it.hasNext()) {
                    String line = it.next();
                    if (line.trim().equals("")) {
                        // blank line = sentence boundary
                        trainOut.append("\n");
                    } else {
                        StringTokenizer st = new StringTokenizer(line, "\t");
                        if (st.countTokens() != 2) {
                            throw new AutomationException("This is not a valid TAB-SEP document");
                        }
                        trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                    }
                }
            }
            // NOTE(review): the document is flagged as NOT processed after conversion;
            // kept as in the original - confirm this is intended.
            trainingDocument.setProcessed(false);
            status.setTrainDocs(status.getTrainDocs() - 1);
        }
    }
}
use of de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus in project webanno by webanno.
the class AutomationUtil method addOtherFeatureTrainDocument.
/**
 * Generates, for each of the template's other features, the training file
 * {@code <featureId>.train} that will be used to predict the training document, so that extra
 * features can be added - for example a POS tag as a feature for an NE classifier.
 * <p>
 * A feature is skipped when its train file already exists and none of its training documents
 * is unprocessed.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aAnnotationService
 *            used to obtain the type adapter of each feature's layer.
 * @param aAutomationService
 *            used to locate the MIRA directory, list training documents, read their CASes and
 *            obtain the automation status.
 * @param aUserDao
 *            not used by this method.
 * @throws IOException
 *             if the MIRA directory cannot be created or a train file cannot be written.
 * @throws UIMAException
 *             declared for the CAS-reading calls.
 * @throws ClassNotFoundException
 *             declared for the CAS-reading calls.
 */
public static void addOtherFeatureTrainDocument(MiraTemplate aTemplate, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws IOException, UIMAException, ClassNotFoundException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }

    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);

    for (AnnotationFeature feature : aTemplate.getOtherFeatures()) {
        File trainFile = new File(miraDir, feature.getId() + ".train");

        boolean documentChanged = false;
        for (TrainingDocument document : aAutomationService.listTrainingDocuments(feature.getProject())) {
            if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) {
                documentChanged = true;
                break;
            }
        }
        // Regenerate when something changed or when the file has never been written.
        if (!documentChanged && trainFile.exists()) {
            continue;
        }

        // try-with-resources closes the writer even if reading a CAS or building a line fails.
        try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
            AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
            for (TrainingDocument trainingDocument : aAutomationService.listTrainingDocuments(feature.getProject())) {
                if ((trainingDocument.getFeature() != null && trainingDocument.getFeature().equals(feature))) {
                    JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                    for (Sentence sentence : select(jCas, Sentence.class)) {
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                    // NOTE(review): the document is flagged as NOT processed after being
                    // written; kept as in the original - confirm this is intended.
                    trainingDocument.setProcessed(false);
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }
    }
}
Aggregations