use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.
the class ExportUtil method exportProjectSettings.
/**
 * Builds the exportable representation of a project: its basic settings, layers and features,
 * tag sets, source and annotation documents, training documents, permissions and MIRA
 * templates.
 *
 * @param annotationService
 *            used to list the layers, tag sets and tags of the project.
 * @param automationService
 *            if present, used to list training documents and MIRA templates; if absent, both
 *            are exported as empty lists.
 * @param documentService
 *            used to list source documents and their annotation documents.
 * @param projectService
 *            used to list the users with permissions and their permission levels.
 * @param aProject
 *            the project to export.
 * @param aProjectSettings
 *            not used by this method.
 * @param aExportTempDir
 *            not used by this method.
 * @return the populated export model of the project.
 */
public static de.tudarmstadt.ukp.clarin.webanno.export.model.Project exportProjectSettings(AnnotationSchemaService annotationService, Optional<AutomationService> automationService, DocumentService documentService, ProjectService projectService, Project aProject, File aProjectSettings, File aExportTempDir) {
    de.tudarmstadt.ukp.clarin.webanno.export.model.Project exported =
            new de.tudarmstadt.ukp.clarin.webanno.export.model.Project();

    // ---- basic project settings ----
    exported.setDescription(aProject.getDescription());
    exported.setName(aProject.getName());
    // In older versions of WebAnno, the mode was an enum which was serialized as upper-case
    // during export but as lower-case in the database. This is compensating for this case.
    exported.setMode(StringUtils.upperCase(aProject.getMode(), Locale.US));
    exported.setScriptDirection(aProject.getScriptDirection());
    exported.setVersion(aProject.getVersion());
    exported.setDisableExport(aProject.isDisableExport());
    exported.setCreated(aProject.getCreated());
    exported.setUpdated(aProject.getUpdated());

    // ---- layers and features ----
    // Both lookup tables are filled while the layers are exported; they are needed afterwards
    // to wire up attach types / attach features and to resolve features of training documents
    // and MIRA templates.
    Map<AnnotationLayer, de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationLayer> exLayerByLayer =
            new HashMap<>();
    Map<AnnotationFeature, de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationFeature> exFeatureByFeature =
            new HashMap<>();
    List<de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationLayer> exportedLayers =
            new ArrayList<>();
    for (AnnotationLayer projectLayer : annotationService.listAnnotationLayer(aProject)) {
        exportedLayers.add(ImportUtil.exportLayerDetails(exLayerByLayer, exFeatureByFeature,
                projectLayer, annotationService));
    }

    // Second pass: now that every layer/feature has an exported twin, link the attachments.
    for (Map.Entry<AnnotationLayer, de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationLayer> mapping : exLayerByLayer
            .entrySet()) {
        AnnotationLayer original = mapping.getKey();
        de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationLayer twin = mapping.getValue();
        if (original.getAttachType() != null) {
            twin.setAttachType(exLayerByLayer.get(original.getAttachType()));
        }
        if (original.getAttachFeature() != null) {
            twin.setAttachFeature(exFeatureByFeature.get(original.getAttachFeature()));
        }
    }
    exported.setLayers(exportedLayers);

    // ---- tag sets and their tags ----
    List<ExportedTagSet> exportedTagSets = new ArrayList<>();
    for (TagSet projectTagSet : annotationService.listTagSets(aProject)) {
        ExportedTagSet tagSetCopy = new ExportedTagSet();
        tagSetCopy.setCreateTag(projectTagSet.isCreateTag());
        tagSetCopy.setDescription(projectTagSet.getDescription());
        tagSetCopy.setLanguage(projectTagSet.getLanguage());
        tagSetCopy.setName(projectTagSet.getName());

        List<ExportedTag> tagCopies = new ArrayList<>();
        for (Tag projectTag : annotationService.listTags(projectTagSet)) {
            ExportedTag tagCopy = new ExportedTag();
            tagCopy.setDescription(projectTag.getDescription());
            tagCopy.setName(projectTag.getName());
            tagCopies.add(tagCopy);
        }
        tagSetCopy.setTags(tagCopies);

        exportedTagSets.add(tagSetCopy);
    }
    exported.setTagSets(exportedTagSets);

    // ---- source documents and the annotation documents that belong to them ----
    List<SourceDocument> exportedSourceDocs = new ArrayList<>();
    List<AnnotationDocument> exportedAnnotationDocs = new ArrayList<>();
    for (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument projectDoc : documentService
            .listSourceDocuments(aProject)) {
        SourceDocument docCopy = new SourceDocument();
        docCopy.setFormat(projectDoc.getFormat());
        docCopy.setName(projectDoc.getName());
        docCopy.setState(projectDoc.getState());
        docCopy.setTimestamp(projectDoc.getTimestamp());
        docCopy.setSentenceAccessed(projectDoc.getSentenceAccessed());
        docCopy.setCreated(projectDoc.getCreated());
        docCopy.setUpdated(projectDoc.getUpdated());

        for (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument projectAnnoDoc : documentService
                .listAnnotationDocuments(projectDoc)) {
            AnnotationDocument annoDocCopy = new AnnotationDocument();
            annoDocCopy.setName(projectAnnoDoc.getName());
            annoDocCopy.setState(projectAnnoDoc.getState());
            annoDocCopy.setUser(projectAnnoDoc.getUser());
            annoDocCopy.setTimestamp(projectAnnoDoc.getTimestamp());
            annoDocCopy.setSentenceAccessed(projectAnnoDoc.getSentenceAccessed());
            annoDocCopy.setCreated(projectAnnoDoc.getCreated());
            annoDocCopy.setUpdated(projectAnnoDoc.getUpdated());
            exportedAnnotationDocs.add(annoDocCopy);
        }

        exportedSourceDocs.add(docCopy);
    }
    exported.setSourceDocuments(exportedSourceDocs);
    exported.setAnnotationDocuments(exportedAnnotationDocs);

    // ---- training documents (only available with the automation service) ----
    if (!automationService.isPresent()) {
        exported.setTrainingDocuments(new ArrayList<>());
    }
    else {
        List<de.tudarmstadt.ukp.clarin.webanno.export.model.TrainingDocument> exportedTrainingDocs =
                new ArrayList<>();
        List<TrainingDocument> projectTrainingDocs = automationService.get()
                .listTrainingDocuments(aProject);

        // Training documents refer to their feature by name, so index the exported features
        // by name.
        Map<String, de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationFeature> exFeatureByName =
                new HashMap<>();
        for (de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationFeature exFeature : exFeatureByFeature
                .values()) {
            exFeatureByName.put(exFeature.getName(), exFeature);
        }

        for (TrainingDocument projectTrainingDoc : projectTrainingDocs) {
            de.tudarmstadt.ukp.clarin.webanno.export.model.TrainingDocument trainingDocCopy =
                    new de.tudarmstadt.ukp.clarin.webanno.export.model.TrainingDocument();
            trainingDocCopy.setFormat(projectTrainingDoc.getFormat());
            trainingDocCopy.setName(projectTrainingDoc.getName());
            trainingDocCopy.setState(projectTrainingDoc.getState());
            trainingDocCopy.setTimestamp(projectTrainingDoc.getTimestamp());
            trainingDocCopy.setSentenceAccessed(projectTrainingDoc.getSentenceAccessed());
            if (projectTrainingDoc.getFeature() != null) {
                trainingDocCopy.setFeature(
                        exFeatureByName.get(projectTrainingDoc.getFeature().getName()));
            }
            exportedTrainingDocs.add(trainingDocCopy);
        }
        exported.setTrainingDocuments(exportedTrainingDocs);
    }

    // ---- project permissions, one entry per user and permission level ----
    List<ProjectPermission> exportedPermissions = new ArrayList<>();
    for (User member : projectService.listProjectUsersWithPermissions(aProject)) {
        for (de.tudarmstadt.ukp.clarin.webanno.model.ProjectPermission granted : projectService
                .listProjectPermissionLevel(member, aProject)) {
            ProjectPermission permissionCopy = new ProjectPermission();
            permissionCopy.setLevel(granted.getLevel());
            permissionCopy.setUser(member.getUsername());
            exportedPermissions.add(permissionCopy);
        }
    }
    exported.setProjectPermissions(exportedPermissions);

    // ---- MIRA automation templates (only available with the automation service) ----
    if (!automationService.isPresent()) {
        exported.setMiraTemplates(new ArrayList<>());
    }
    else {
        List<de.tudarmstadt.ukp.clarin.webanno.export.model.MiraTemplate> exportedTemplates =
                new ArrayList<>();
        for (MiraTemplate projectTemplate : automationService.get().listMiraTemplates(aProject)) {
            de.tudarmstadt.ukp.clarin.webanno.export.model.MiraTemplate templateCopy =
                    new de.tudarmstadt.ukp.clarin.webanno.export.model.MiraTemplate();
            templateCopy.setAnnotateAndPredict(projectTemplate.isAnnotateAndRepeat());
            templateCopy.setAutomationStarted(projectTemplate.isAutomationStarted());
            templateCopy.setCurrentLayer(projectTemplate.isCurrentLayer());
            templateCopy.setResult(projectTemplate.getResult());
            templateCopy.setTrainFeature(exFeatureByFeature.get(projectTemplate.getTrainFeature()));
            // The other-features set is only populated on the export when there is at least
            // one such feature.
            if (!projectTemplate.getOtherFeatures().isEmpty()) {
                Set<de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationFeature> otherFeatureCopies =
                        new HashSet<>();
                for (AnnotationFeature otherFeature : projectTemplate.getOtherFeatures()) {
                    otherFeatureCopies.add(exFeatureByFeature.get(otherFeature));
                }
                templateCopy.setOtherFeatures(otherFeatureCopies);
            }
            exportedTemplates.add(templateCopy);
        }
        exported.setMiraTemplates(exportedTemplates);
    }

    return exported;
}
use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.
the class AutomationUtil method generateFinalClassifier.
/**
 * Trains the final MIRA classifier for the train feature of the given template and saves the
 * resulting model.
 * <p>
 * Training is skipped, and the result stored on the template is returned, unless at least one
 * of the following holds: an unprocessed training document exists for one of the template's
 * other features or for the train feature, a source document is in state
 * {@code CURATION_FINISHED}, or an unprocessed tab-sep document exists for the train feature.
 * Otherwise predictions from other layers and tab-sep documents are collected, the training
 * file is (re)generated, the model is trained and saved, and all training and tab-sep
 * documents of the project are flagged as processed.
 *
 * @param aTemplate
 *            the template holding the train feature and the other features.
 * @param aRepository
 *            the document service used to list source documents.
 * @param aCurationDocumentService
 *            passed on to the generation of the training document.
 * @param aAnnotationService
 *            passed on to the helpers that build features and the training document.
 * @param aAutomationService
 *            provides the MIRA directory, model files and training/tab-sep documents.
 * @param aUserDao
 *            passed on to the helpers that build features and the training document.
 * @return the result of the last training iteration, or the template's stored result if
 *         nothing changed.
 * @throws UIMAException
 *             declared by this method; presumably raised by the helpers that read CASes.
 * @throws ClassNotFoundException
 *             declared by this method; presumably raised while loading MIRA models.
 * @throws IOException
 *             if file access fails.
 * @throws AnnotationException
 *             declared by this method; presumably raised by the helpers it calls.
 * @throws AutomationException
 *             if an error occurs.
 */
public static String generateFinalClassifier(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException, AutomationException {
    // Fixed training settings.
    int minFrequency = 2;
    double clip = 1;
    int rounds = 10;
    int beam = 0;
    boolean usePosteriors = false;

    AnnotationFeature trainFeature = aTemplate.getTrainFeature();
    List<List<String>> predictedColumns = new ArrayList<>();
    File miraDir = aAutomationService.getMiraDir(trainFeature);
    Mira mira = new Mira();

    // All files of this feature share the "<layerId>-<featureId>" prefix.
    String filePrefix = trainFeature.getLayer().getId() + "-" + trainFeature.getId();
    File predFile = new File(miraDir, filePrefix + ".train.ft");
    File predictedFile = new File(predFile.getAbsolutePath() + "-pred");

    boolean retrainNeeded = false;

    // A. training document for other train layers were changed
    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument candidate : aAutomationService
                .listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (candidate.isProcessed() || candidate.getFeature() == null
                    || !candidate.getFeature().equals(otherFeature)) {
                continue;
            }
            retrainNeeded = true;
            break;
        }
    }

    // B. Training document for the main training layer were changed
    for (TrainingDocument candidate : aAutomationService
            .listTrainingDocuments(trainFeature.getProject())) {
        if (candidate.isProcessed() || candidate.getFeature() == null
                || !candidate.getFeature().equals(trainFeature)) {
            continue;
        }
        retrainNeeded = true;
        break;
    }

    // C. New Curation document arrives
    for (SourceDocument candidate : aRepository.listSourceDocuments(trainFeature.getProject())) {
        if (candidate.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
            retrainNeeded = true;
            break;
        }
    }

    // D. tab-sep training documents
    for (TrainingDocument candidate : aAutomationService
            .listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (candidate.isProcessed() || candidate.getFeature() == null
                || !candidate.getFeature().equals(trainFeature)) {
            continue;
        }
        retrainNeeded = true;
        break;
    }

    if (!retrainNeeded) {
        return aTemplate.getResult();
    }

    // The base file holds the plain training data; the final training file is either a copy
    // of it (no other layer used) or the base data extended with all collected predictions.
    File baseTrainFile = new File(miraDir, filePrefix + ".train.base");
    File trainFile = new File(miraDir, filePrefix + ".train");
    String trainPath = trainFile.getAbsolutePath();
    String modelPath = aAutomationService.getMiraModel(trainFeature, false, null)
            .getAbsolutePath();

    // Collect predictions, then write the base training file.
    getFeatureOtherLayer(aTemplate, aRepository, aAutomationService, aAnnotationService,
            aUserDao, beam, usePosteriors, predictedColumns, mira, predFile, predictedFile, null);
    getFeaturesTabSep(aTemplate, aAutomationService, beam, usePosteriors, trainFeature,
            predictedColumns, mira, predFile, predictedFile);
    generateTrainDocument(aTemplate, aRepository, aCurationDocumentService, aAnnotationService,
            aAutomationService, aUserDao, false);

    // The template is created for as many extra columns as there are prediction lists
    // (zero when none were collected).
    String trainTemplate = createTemplate(aTemplate.getTrainFeature(),
            getMiraTemplateFile(trainFeature, aAutomationService), predictedColumns.size());
    if (predictedColumns.isEmpty()) {
        FileUtils.copyFile(baseTrainFile, trainFile);
    }
    else {
        buildTrainFile(baseTrainFile, trainFile, predictedColumns);
    }

    // Configure and run the training.
    if (!trainFeature.getLayer().isLockToTokenOffset()) {
        mira.setIobScorer();
    }
    mira.loadTemplates(trainTemplate);
    mira.setClip(clip);
    mira.maxPosteriors = usePosteriors;
    mira.beamSize = beam;
    int numExamples = mira.count(trainPath, minFrequency);
    mira.initModel(false);

    String trainResult = "";
    for (int round = 0; round < rounds; round++) {
        trainResult = mira.train(trainPath, rounds, numExamples, round);
        mira.averageWeights(rounds * numExamples);
    }
    mira.saveModel(modelPath);

    // all training documents are processed by now
    for (TrainingDocument done : aAutomationService
            .listTrainingDocuments(trainFeature.getProject())) {
        done.setProcessed(true);
    }
    for (TrainingDocument done : aAutomationService
            .listTabSepDocuments(trainFeature.getProject())) {
        done.setProcessed(true);
    }
    return trainResult;
}
use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.
the class AutomationUtil method generateTrainDocument.
/**
 * Writes the MIRA training file for the train feature of the given template.
 * <p>
 * The file is assembled from three sources: non-tab-sep training documents of the feature,
 * source documents in state {@code CURATION_FINISHED}, and tab-sep documents of the feature.
 * Nothing is written if none of the change conditions checked at the start of the method
 * holds.
 * <p>
 * The output writer and every tab-sep input reader are closed even when reading or writing
 * fails.
 *
 * @param aTemplate
 *            the template holding the train feature and the other features.
 * @param aRepository
 *            the document service used to list source documents.
 * @param aCurationDocumentService
 *            used to read the curation CAS of finished source documents.
 * @param aAnnotationService
 *            used to obtain the type adapter of the feature's layer.
 * @param aAutomationService
 *            provides the MIRA directory, the automation status and the training documents.
 * @param aUserDao
 *            not used by this method.
 * @param aBase
 *            if {@code true}, lines are generated without the feature (or with an empty label
 *            for tab-sep documents) and written to the {@code .train.ft} file; otherwise the
 *            feature is included, the {@code .train.base} file is written, and the processed
 *            flags and the status counter are updated.
 * @throws IOException
 *             if the training file cannot be written or an input cannot be read.
 * @throws UIMAException
 *             declared by this method; presumably raised while reading CASes.
 * @throws ClassNotFoundException
 *             declared by this method; presumably raised while reading CASes.
 * @throws AutomationException
 *             if a tab-sep document contains a non-empty line that does not have exactly two
 *             tab-separated columns.
 */
public static void generateTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    LOG.info("Starting to generate training document");
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;
    // A. training document for other train layers were changed
    for (AnnotationFeature otherrFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument document : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(otherrFeature)) {
                documentChanged = true;
                break;
            }
        }
    }
    // B. Training document for the main training layer were changed
    for (TrainingDocument document : aAutomationService.listTrainingDocuments(feature.getProject())) {
        if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }
    // C. New Curation document arrives
    // NOTE(review): this fires as soon as the project has any source document, regardless of
    // its state - confirm whether only CURATION_FINISHED documents were meant.
    if (aRepository.listSourceDocuments(feature.getProject()).size() > 0) {
        documentChanged = true;
    }
    // D. tab-sep training documents
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return;
    }
    File trainFile;
    if (aBase) {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    // try-with-resources guarantees the writer is closed on every exit path, including
    // exceptions raised while reading CASes or tab-sep files.
    try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
        AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
        // Training documents (Curated or webanno-compatible imported ones - read using UIMA)
        List<TrainingDocument> trainingDocuments = aAutomationService.listTrainingDocuments(feature.getProject());
        for (TrainingDocument trainingDocument : trainingDocuments) {
            if ((trainingDocument.getFeature() != null && trainingDocument.getFeature().equals(feature)) && !trainingDocument.getFormat().equals(WebAnnoConst.TAB_SEP)) {
                JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                trainingDocument.setProcessed(!aBase);
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }
        // for curated documents
        List<SourceDocument> sourceDocuments = aRepository.listSourceDocuments(feature.getProject());
        int sourceDocsCounter = 0;
        for (SourceDocument sourceDocument : sourceDocuments) {
            if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
                JCas jCas = aCurationDocumentService.readCurationCas(sourceDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
            sourceDocsCounter++;
            // The total is the number of source documents being iterated, not the number of
            // training documents.
            LOG.info("Processed source document " + sourceDocsCounter + " of " + sourceDocuments.size());
        }
        // Tab-sep documents to be used as a target layer train document
        int goldStandardDocsCounter = 0;
        List<TrainingDocument> goldStandardDocs = aAutomationService.listTabSepDocuments(feature.getProject());
        for (TrainingDocument document : goldStandardDocs) {
            if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null && document.getFeature().equals(feature)) {
                File tabSepFile = new File(aAutomationService.getDocumentFolder(document), document.getName());
                // Own the reader here so that the file handle is released even if a line is
                // invalid or writing fails.
                try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                    LineIterator it = IOUtils.lineIterator(tabSepReader);
                    while (it.hasNext()) {
                        String line = it.next();
                        if (line.trim().equals("")) {
                            trainOut.append("\n");
                        } else {
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            if (st.countTokens() != 2) {
                                throw new AutomationException("This is not a valid TAB-SEP document");
                            }
                            if (aBase) {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), ""));
                            } else {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                            }
                        }
                    }
                }
            }
            goldStandardDocsCounter++;
            LOG.info("Processed gold standard document " + goldStandardDocsCounter + " of " + goldStandardDocs.size());
        }
    }
    LOG.info("Completed generating training document");
}
use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.
the class AutomationUtil method getFeaturesTabSep.
/**
 * For every tab-sep document of the project that has a MIRA model file, runs that model on
 * the prediction input file and appends the predicted tags as one list to {@code predictions}.
 * <p>
 * Documents whose model file does not exist are skipped. All streams opened here are closed
 * before moving on to the next document, also when prediction fails.
 *
 * @param aTemplate
 *            the template whose train feature determines the project.
 * @param aAutomationService
 *            provides the tab-sep documents and their model files.
 * @param beamSize
 *            beam size set on {@code mira} before testing.
 * @param maxPosteriors
 *            max-posteriors flag set on {@code mira} before testing.
 * @param layerFeature
 *            the feature for which the per-document model is looked up.
 * @param predictions
 *            receives one list of predicted tags per document that has a model.
 * @param mira
 *            the MIRA instance used to load each model and run the prediction.
 * @param predFile
 *            the input file to predict on.
 * @param predcitedFile
 *            the file the prediction output is written to; overwritten for each document.
 * @throws IOException
 *             if a file cannot be opened, read or written.
 * @throws ClassNotFoundException
 *             declared by this method; presumably raised while loading a model.
 * @throws AutomationException
 *             if running the prediction for a document fails.
 */
private static void getFeaturesTabSep(MiraTemplate aTemplate, AutomationService aAutomationService, int beamSize, boolean maxPosteriors, AnnotationFeature layerFeature, List<List<String>> predictions, Mira mira, File predFile, File predcitedFile) throws IOException, ClassNotFoundException, AutomationException {
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        int shiftColumns = 0;
        int nbest = 1;
        String modelName = aAutomationService.getMiraModel(layerFeature, true, document).getAbsolutePath();
        if (!new File(modelName).exists()) {
            continue;
        }
        String testName = predFile.getAbsolutePath();
        // The input always comes from the prediction file (getAbsolutePath() never returns
        // null), so no fallback reader is needed. Both streams are closed before the
        // predicted file is read back below.
        try (PrintStream stream = new PrintStream(predcitedFile);
                BufferedReader input = new BufferedReader(new FileReader(testName))) {
            mira.loadModel(modelName);
            mira.setShiftColumns(shiftColumns);
            mira.nbest = nbest;
            mira.beamSize = beamSize;
            mira.maxPosteriors = maxPosteriors;
            try {
                mira.test(input, stream);
            } catch (Exception e) {
                // NOTE(review): the cause is dropped because only the message-only
                // AutomationException constructor is visible from here.
                throw new AutomationException(document.getName() + " is Invalid TAB-SEP file!");
            }
        }
        List<String> annotations = new ArrayList<>();
        try (FileReader predictedReader = new FileReader(predcitedFile)) {
            LineIterator it = IOUtils.lineIterator(predictedReader);
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().equals("")) {
                    continue;
                }
                // The predicted tag is the last space-separated token of the line.
                StringTokenizer st = new StringTokenizer(line, " ");
                String tag = "";
                while (st.hasMoreTokens()) {
                    tag = st.nextToken();
                }
                annotations.add(tag);
            }
        }
        predictions.add(annotations);
    }
}
use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.
the class AutomationUtil method addTabSepTrainDocument.
/**
 * Converts every tab-sep document of the project that has no feature assigned into its own
 * MIRA training file ({@code <documentId><projectId>.train}) in the MIRA directory of the
 * template's train feature.
 * <p>
 * Nothing is done unless at least one tab-sep document of the project is unprocessed.
 * Tab-sep documents that have a feature are target-layer training documents and are skipped
 * here. The writer and reader of each document are closed even when conversion fails.
 *
 * @param aTemplate
 *            the template whose train feature determines the project and the MIRA directory.
 * @param aAutomationService
 *            provides the MIRA directory, the automation status and the tab-sep documents.
 * @throws IOException
 *             if a training file cannot be written or a tab-sep document cannot be read.
 * @throws UIMAException
 *             declared by this method; not raised by the code visible here.
 * @throws ClassNotFoundException
 *             declared by this method; not raised by the code visible here.
 * @throws AutomationException
 *             if a document contains a non-empty line that does not have exactly two
 *             tab-separated columns.
 */
public static void addTabSepTrainDocument(MiraTemplate aTemplate, AutomationService aAutomationService) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    boolean documentChanged = false;
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed()) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return;
    }
    for (TrainingDocument trainingDocument : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (trainingDocument.getFeature() != null) {
            // This is a target layer train document
            continue;
        }
        File trainFile = new File(miraDir, trainingDocument.getId() + trainingDocument.getProject().getId() + ".train");
        File tabSepFile = new File(aAutomationService.getDocumentFolder(trainingDocument), trainingDocument.getName());
        // Resources are opened in the original order (writer first, then reader) and are
        // both released on every exit path.
        try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile));
                FileReader tabSepReader = new FileReader(tabSepFile)) {
            LineIterator it = IOUtils.lineIterator(tabSepReader);
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().equals("")) {
                    trainOut.append("\n");
                } else {
                    StringTokenizer st = new StringTokenizer(line, "\t");
                    if (st.countTokens() != 2) {
                        throw new AutomationException("This is not a valid TAB-SEP document");
                    }
                    trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                }
            }
            // NOTE(review): the document is flagged as NOT processed after conversion;
            // presumably a later training step sets the flag - confirm.
            trainingDocument.setProcessed(false);
            status.setTrainDocs(status.getTrainDocs() - 1);
        }
    }
}
Aggregations