use of edu.lium.mira.Mira in project webanno by webanno.
the class AutomationUtil method generateFinalClassifier.
/**
 * Trains the final MIRA classifier for the train feature of the given template and saves the
 * resulting model.
 * <p>
 * Training only happens when new input is detected, i.e. when at least one of these holds:
 * <ul>
 * <li>an unprocessed training document exists for one of the template's other features,</li>
 * <li>an unprocessed training document exists for the train feature itself,</li>
 * <li>a source document of the project is in state {@code CURATION_FINISHED},</li>
 * <li>an unprocessed tab-separated document exists for the train feature.</li>
 * </ul>
 * Otherwise the result already stored on the template is returned unchanged. After training,
 * every training document and tab-separated document of the project is flagged as processed.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aRepository
 *            the document service used to list the project's source documents.
 * @param aCurationDocumentService
 *            the curation document service, handed on to the train document generation.
 * @param aAnnotationService
 *            the annotation schema service, handed on to the helper methods.
 * @param aAutomationService
 *            the automation service providing the MIRA directory, the model file and the
 *            training / tab-separated documents.
 * @param aUserDao
 *            the user DAO, handed on to the helper methods.
 * @return the value returned by the last training iteration, or the template's stored result
 *         when no new input was detected.
 * @throws UIMAException
 *             if propagated from one of the called helpers.
 * @throws ClassNotFoundException
 *             if propagated from one of the called helpers.
 * @throws IOException
 *             if propagated from one of the called helpers or from copying the train file.
 * @throws AnnotationException
 *             if propagated from one of the called helpers.
 * @throws AutomationException
 *             if propagated from one of the called helpers.
 */
public static String generateFinalClassifier(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException, AutomationException {
    // Fixed MIRA training settings.
    final int minFrequency = 2;
    final double clip = 1;
    final int epochs = 10;
    final int beam = 0;
    final boolean usePosteriors = false;
    final boolean randomInit = false;

    AnnotationFeature layerFeature = aTemplate.getTrainFeature();
    List<List<String>> predictions = new ArrayList<>();
    File miraDir = aAutomationService.getMiraDir(layerFeature);
    Mira mira = new Mira();

    // All files of this feature share the "<layer id>-<feature id>" name prefix.
    String filePrefix = layerFeature.getLayer().getId() + "-" + layerFeature.getId();
    File predFile = new File(miraDir, filePrefix + ".train.ft");
    File predictedFile = new File(predFile.getAbsolutePath() + "-pred");

    boolean newInputAvailable = false;

    // A. a training document of one of the other train layers is still unprocessed
    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument candidate : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (candidate.isProcessed() || candidate.getFeature() == null) {
                continue;
            }
            if (candidate.getFeature().equals(otherFeature)) {
                newInputAvailable = true;
                break;
            }
        }
    }

    // B. a training document of the main train layer is still unprocessed
    for (TrainingDocument candidate : aAutomationService.listTrainingDocuments(layerFeature.getProject())) {
        if (candidate.isProcessed() || candidate.getFeature() == null) {
            continue;
        }
        if (candidate.getFeature().equals(layerFeature)) {
            newInputAvailable = true;
            break;
        }
    }

    // C. a source document has finished curation
    for (SourceDocument sourceDocument : aRepository.listSourceDocuments(layerFeature.getProject())) {
        if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
            newInputAvailable = true;
            break;
        }
    }

    // D. a tab-separated training document of the main train layer is still unprocessed
    for (TrainingDocument candidate : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (candidate.isProcessed() || candidate.getFeature() == null) {
            continue;
        }
        if (candidate.getFeature().equals(layerFeature)) {
            newInputAvailable = true;
            break;
        }
    }

    if (!newInputAvailable) {
        // Nothing changed since the last run: reuse the stored result.
        return aTemplate.getResult();
    }

    File baseTrainFile = new File(miraDir, filePrefix + ".train.base");
    File trainFile = new File(miraDir, filePrefix + ".train");
    String trainPath = trainFile.getAbsolutePath();
    String modelPath = aAutomationService.getMiraModel(layerFeature, false, null).getAbsolutePath();

    // Collect the prediction columns contributed by other layers and tab-sep documents, then
    // (re)generate the train document.
    getFeatureOtherLayer(aTemplate, aRepository, aAutomationService, aAnnotationService, aUserDao, beam, usePosteriors, predictions, mira, predFile, predictedFile, null);
    getFeaturesTabSep(aTemplate, aAutomationService, beam, usePosteriors, layerFeature, predictions, mira, predFile, predictedFile);
    generateTrainDocument(aTemplate, aRepository, aCurationDocumentService, aAnnotationService, aAutomationService, aUserDao, false);

    // Without extra prediction columns the base file is used as the train file as is;
    // otherwise the predictions are merged into it.
    int predictionColumns = predictions.size();
    String trainTemplate = createTemplate(aTemplate.getTrainFeature(), getMiraTemplateFile(layerFeature, aAutomationService), predictionColumns);
    if (predictionColumns == 0) {
        FileUtils.copyFile(baseTrainFile, trainFile);
    } else {
        buildTrainFile(baseTrainFile, trainFile, predictions);
    }

    if (!layerFeature.getLayer().isLockToTokenOffset()) {
        mira.setIobScorer();
    }
    mira.loadTemplates(trainTemplate);
    mira.setClip(clip);
    mira.maxPosteriors = usePosteriors;
    mira.beamSize = beam;
    int numExamples = mira.count(trainPath, minFrequency);
    mira.initModel(randomInit);

    String trainResult = "";
    int epoch = 0;
    while (epoch < epochs) {
        trainResult = mira.train(trainPath, epochs, numExamples, epoch);
        mira.averageWeights(epochs * numExamples);
        epoch++;
    }
    mira.saveModel(modelPath);

    // Everything that was available has been used for training by now.
    for (TrainingDocument trained : aAutomationService.listTrainingDocuments(layerFeature.getProject())) {
        trained.setProcessed(true);
    }
    for (TrainingDocument trained : aAutomationService.listTabSepDocuments(layerFeature.getProject())) {
        trained.setProcessed(true);
    }
    return trainResult;
}
use of edu.lium.mira.Mira in project webanno by webanno.
the class AutomationUtil method addOtherFeatureToPredictDocument.
/**
 * Builds, for every source document of the train feature's project, the {@code .pred} file
 * in the MIRA directory from the document's {@code .pred.ft} file plus the prediction columns
 * obtained from the other layers and the tab-separated documents.
 * <p>
 * When no prediction columns were collected, the {@code .pred.ft} file is copied to the
 * {@code .pred} file unchanged; otherwise the predictions are merged in.
 *
 * @param aTemplate
 *            the template providing the train feature.
 * @param aRepository
 *            the document service used to list the project's source documents.
 * @param aAnnotationService
 *            the annotation schema service, handed on to the helper methods.
 * @param aAutomationService
 *            the automation service providing the MIRA directory.
 * @param aUserDao
 *            the user DAO, handed on to the helper methods.
 * @throws UIMAException
 *             if propagated from one of the called helpers.
 * @throws ClassNotFoundException
 *             if propagated from one of the called helpers.
 * @throws IOException
 *             if propagated from one of the called helpers or from copying the file.
 * @throws AnnotationException
 *             if propagated from one of the called helpers.
 * @throws AutomationException
 *             if propagated from one of the called helpers.
 */
public static void addOtherFeatureToPredictDocument(MiraTemplate aTemplate, DocumentService aRepository, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException, AutomationException {
    final int beam = 0;
    final boolean usePosteriors = false;

    AnnotationFeature layerFeature = aTemplate.getTrainFeature();
    File miraDir = aAutomationService.getMiraDir(layerFeature);

    for (SourceDocument sourceDocument : aRepository.listSourceDocuments(layerFeature.getProject())) {
        // Fresh prediction list and MIRA instance per document.
        List<List<String>> predictions = new ArrayList<>();
        File featureFile = new File(miraDir, sourceDocument.getId() + ".pred.ft");
        Mira mira = new Mira();
        File predictedFile = new File(featureFile.getAbsolutePath() + "-pred");

        getFeatureOtherLayer(aTemplate, aRepository, aAutomationService, aAnnotationService, aUserDao, beam, usePosteriors, predictions, mira, featureFile, predictedFile, sourceDocument);
        getFeaturesTabSep(aTemplate, aAutomationService, beam, usePosteriors, layerFeature, predictions, mira, featureFile, predictedFile);

        File targetFile = new File(miraDir, sourceDocument.getId() + ".pred");

        // The template is (re)created for the number of collected prediction columns; its
        // returned name is not needed here.
        int predictionColumns = predictions.size();
        createTemplate(aTemplate.getTrainFeature(), getMiraTemplateFile(layerFeature, aAutomationService), predictionColumns);
        if (predictionColumns == 0) {
            FileUtils.copyFile(featureFile, targetFile);
        } else {
            buildPredictFile(featureFile, targetFile, predictions, aTemplate.getTrainFeature());
        }
    }
}
use of edu.lium.mira.Mira in project webanno by webanno.
the class AutomationUtil method otherFeatureClassifiers.
/**
 * Trains and saves one MIRA model per "other" feature of the template.
 * <p>
 * When additional layers are used as training features, the training document should be
 * auto-predicted with those layers. Example: if the train layer is Named Entity and the POS
 * layer is used as additional feature, the training document should be predicted using the
 * POS layer documents for POS annotation. This method produces the models used for that.
 * <p>
 * Each feature is trained from the file {@code <feature id>.train} in the MIRA directory of
 * the template's train feature. One {@code Mira} instance is reused for all features.
 *
 * @param aTemplate
 *            the template providing the other features and the train feature.
 * @param aRepository
 *            the document service (not used by this method).
 * @param aAutomationService
 *            the automation service providing the MIRA directory and the model files.
 * @throws IOException
 *             if propagated from the template creation or from MIRA.
 * @throws ClassNotFoundException
 *             if propagated from MIRA.
 */
public static void otherFeatureClassifiers(MiraTemplate aTemplate, DocumentService aRepository, AutomationService aAutomationService) throws IOException, ClassNotFoundException {
    Mira mira = new Mira();

    // Fixed MIRA training settings.
    final int minFrequency = 2;
    final double clip = 1;
    final int epochs = 10;
    final int beam = 0;
    final boolean usePosteriors = false;
    final boolean randomInit = false;

    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        String templatePath = createTemplate(otherFeature, getMiraTemplateFile(otherFeature, aAutomationService), 0);
        File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
        String trainPath = new File(miraDir, otherFeature.getId() + ".train").getAbsolutePath();
        String modelPath = aAutomationService.getMiraModel(otherFeature, true, null).getAbsolutePath();

        // No initial model is configured, so training always starts from scratch.
        String initialModelPath = "";

        if (!otherFeature.getLayer().isLockToTokenOffset()) {
            mira.setIobScorer();
        }
        mira.loadTemplates(templatePath);
        mira.setClip(clip);
        mira.maxPosteriors = usePosteriors;
        mira.beamSize = beam;
        int numExamples = mira.count(trainPath, minFrequency);
        mira.initModel(randomInit);
        if (!initialModelPath.isEmpty()) {
            mira.loadModel(initialModelPath);
        }

        int epoch = 0;
        while (epoch < epochs) {
            mira.train(trainPath, epochs, numExamples, epoch);
            mira.averageWeights(epochs * numExamples);
            epoch++;
        }
        mira.saveModel(modelPath);
    }
}
use of edu.lium.mira.Mira in project webanno by webanno.
the class AutomationUtil method predict.
/**
 * Applies the trained MIRA model of the template's train feature to every source document of
 * the project and writes the predicted labels to the document's correction CAS.
 * <p>
 * For each source document the file {@code <document id>.pred} in the MIRA directory is fed
 * to MIRA, the output is written to {@code <document id>.pred-pred}, and the last
 * space-separated token of every non-blank output line is taken as the predicted label.
 *
 * @param aTemplate
 *            the template providing the train feature.
 * @param aRepository
 *            the document service used to list source documents and to load the annotation
 *            CAS of the current user.
 * @param aCorrectionDocumentService
 *            the service used to write the correction CAS.
 * @param aAutomationService
 *            the automation service providing the MIRA directory, the model file and the
 *            automation status.
 * @param aUserDao
 *            the user DAO used to obtain the current user.
 * @throws UIMAException
 *             if propagated from loading or writing a CAS.
 * @throws ClassNotFoundException
 *             if propagated from MIRA or from loading a CAS.
 * @throws IOException
 *             if a MIRA file cannot be read or written.
 * @throws AnnotationException
 *             if propagated from writing the predictions into the CAS.
 */
public static void predict(MiraTemplate aTemplate, DocumentService aRepository, CorrectionDocumentService aCorrectionDocumentService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException {
    AnnotationFeature layerFeature = aTemplate.getTrainFeature();
    File miraDir = aAutomationService.getMiraDir(layerFeature);
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) {
        File predFile = new File(miraDir, document.getId() + ".pred");
        Mira mira = new Mira();
        int shiftColumns = 0;
        int nbest = 1;
        int beamSize = 0;
        boolean maxPosteriors = false;
        String modelName = aAutomationService.getMiraModel(layerFeature, false, null).getAbsolutePath();
        String testName = predFile.getAbsolutePath();
        File predcitedFile = new File(predFile.getAbsolutePath() + "-pred");

        // Both streams are closed (and the output thereby flushed) before the prediction
        // file is read back below; otherwise buffered output could be missing from the file
        // and the file handles would leak for every document.
        try (PrintStream stream = new PrintStream(predcitedFile);
                BufferedReader input = new BufferedReader(new FileReader(testName))) {
            mira.loadModel(modelName);
            mira.setShiftColumns(shiftColumns);
            mira.nbest = nbest;
            mira.beamSize = beamSize;
            mira.maxPosteriors = maxPosteriors;
            mira.test(input, stream);
        }
        LOG.info("Prediction is wrtten to a MIRA File. To be done is writing back to the CAS");

        // The predicted label is the last space-separated column of each non-blank line.
        List<String> annotations = new ArrayList<>();
        try (FileReader predictedReader = new FileReader(predcitedFile)) {
            LineIterator it = IOUtils.lineIterator(predictedReader);
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().isEmpty()) {
                    continue;
                }
                StringTokenizer st = new StringTokenizer(line, " ");
                String tag = "";
                while (st.hasMoreTokens()) {
                    tag = st.nextToken();
                }
                annotations.add(tag);
            }
        }
        LOG.info(annotations.size() + " Predictions found to be written to the CAS");

        JCas jCas = null;
        User user = aUserDao.getCurrentUser();
        // NOTE(review): the handling below is kept exactly as it was. If loading the CAS
        // fails, jCas is still null inside the catch block, and the statements after the
        // try/catch then repeat the automate/write/decrement steps a second time. The
        // intended fallback cannot be determined from this method alone - confirm and fix
        // together with automate().
        try {
            AnnotationDocument annoDocument = aRepository.getAnnotationDocument(document, user);
            jCas = aRepository.readAnnotationCas(annoDocument);
            automate(jCas, layerFeature, annotations);
        } catch (DataRetrievalFailureException e) {
            automate(jCas, layerFeature, annotations);
            LOG.info("Predictions found are written to the CAS");
            aCorrectionDocumentService.writeCorrectionCas(jCas, document);
            status.setAnnoDocs(status.getAnnoDocs() - 1);
        }
        automate(jCas, layerFeature, annotations);
        LOG.info("Predictions found are written to the CAS");
        aCorrectionDocumentService.writeCorrectionCas(jCas, document);
        status.setAnnoDocs(status.getAnnoDocs() - 1);
    }
}
use of edu.lium.mira.Mira in project webanno by webanno.
the class AutomationUtil method tabSepClassifiers.
/**
 * Trains and saves one MIRA model per external tab-separated file (token TAB feature) that
 * is not bound to a feature.
 * <p>
 * Nothing is done unless at least one tab-separated document of the project is still
 * unprocessed. Tab-separated documents that carry a feature are skipped, as they are train
 * documents for the target layer. One {@code Mira} instance is reused for all documents.
 *
 * @param aTemplate
 *            the template providing the train feature (and through it the project).
 * @param aAutomationService
 *            the automation service providing the tab-separated documents, the MIRA directory
 *            and the model files.
 * @throws IOException
 *             if propagated from the template creation or from MIRA.
 * @throws ClassNotFoundException
 *             if propagated from MIRA.
 */
public static void tabSepClassifiers(MiraTemplate aTemplate, AutomationService aAutomationService) throws IOException, ClassNotFoundException {
    Mira mira = new Mira();

    // Fixed MIRA training settings.
    final int minFrequency = 2;
    final double clip = 1;
    final int epochs = 10;
    final int beam = 0;
    final boolean usePosteriors = false;
    final boolean randomInit = false;

    // Only retrain when some tab-sep document has not been processed yet.
    boolean hasUnprocessedDocument = false;
    for (TrainingDocument candidate : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!candidate.isProcessed()) {
            hasUnprocessedDocument = true;
            break;
        }
    }
    if (!hasUnprocessedDocument) {
        return;
    }

    for (TrainingDocument trainingDocument : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        // Documents with a feature are target layer train documents and are handled elsewhere.
        if (trainingDocument.getFeature() == null) {
            File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
            // NOTE(review): the two ids are combined with '+' before the suffix is appended;
            // if both are numeric this is a sum, not a concatenation - kept as is, confirm
            // against the code that writes this file.
            File trainFile = new File(miraDir, trainingDocument.getId() + trainingDocument.getProject().getId() + ".train");
            String templatePath = createTemplate(null, getMiraTemplateFile(aTemplate.getTrainFeature(), aAutomationService), 0);
            String trainPath = trainFile.getAbsolutePath();
            String modelPath = aAutomationService.getMiraModel(aTemplate.getTrainFeature(), true, trainingDocument).getAbsolutePath();

            // No initial model is configured, so training always starts from scratch.
            String initialModelPath = "";

            mira.loadTemplates(templatePath);
            mira.setClip(clip);
            mira.maxPosteriors = usePosteriors;
            mira.beamSize = beam;
            int numExamples = mira.count(trainPath, minFrequency);
            mira.initModel(randomInit);
            if (!initialModelPath.isEmpty()) {
                mira.loadModel(initialModelPath);
            }

            int epoch = 0;
            while (epoch < epochs) {
                mira.train(trainPath, epochs, numExamples, epoch);
                mira.averageWeights(epochs * numExamples);
                epoch++;
            }
            mira.saveModel(modelPath);
        }
    }
}
Aggregations