use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.
the class RemoteApiController2 method createCompatibleCas.
/**
 * Imports an uploaded annotation file into a CAS and verifies that it is compatible with the
 * given source document.
 * <p>
 * The upload is accepted if its text (ignoring a trailing line break) equals the text of the
 * initial CAS of the source document, if both contain the same number of sentences and tokens,
 * and if {@code assertCompatibleOffsets} accepts the sentence and token annotations. An
 * unsupported format is reported with an {@code UnsupportedFormatException}; a failed
 * compatibility check is reported with an {@code IncompatibleDocumentException}.
 *
 * @param aProjectId
 *            id used to look up the project.
 * @param aDocumentId
 *            id used to look up the source document within the project.
 * @param aFile
 *            the uploaded annotation file.
 * @param aFormat
 *            the format of the uploaded file; {@code FORMAT_DEFAULT} is used if absent.
 * @return the CAS imported from the upload; its sofa string and the end offset of its document
 *         annotation are overwritten with the values derived from the initial CAS text.
 * @throws RemoteApiException
 *             declared for the project/document lookup and the checks described above.
 * @throws ClassNotFoundException
 *             propagated from the called services.
 * @throws IOException
 *             if the temporary upload file cannot be written or deleted, or propagated from the
 *             called services.
 * @throws UIMAException
 *             propagated from the called services.
 */
private JCas createCompatibleCas(long aProjectId, long aDocumentId, MultipartFile aFile,
        Optional<String> aFormat)
    throws RemoteApiException, ClassNotFoundException, IOException, UIMAException {
    Project project = getProject(aProjectId);
    SourceDocument document = getDocument(project, aDocumentId);

    // Check if the format is supported
    String format = aFormat.orElse(FORMAT_DEFAULT);
    Map<String, Class<CollectionReader>> readableFormats = importExportService
            .getReadableFormats();
    if (readableFormats.get(format) == null) {
        throw new UnsupportedFormatException(
                "Format [%s] not supported. Acceptable formats are %s.", format,
                readableFormats.keySet());
    }

    // Convert the uploaded annotation document into a CAS. The upload is spooled to a temporary
    // file which is removed again no matter whether the import succeeds.
    File tmpFile = null;
    JCas annotationCas;
    try {
        tmpFile = File.createTempFile("upload", ".bin");
        aFile.transferTo(tmpFile);
        annotationCas = importExportService.importCasFromFile(tmpFile, project, format);
    } finally {
        if (tmpFile != null) {
            FileUtils.forceDelete(tmpFile);
        }
    }

    // Check if the uploaded file is compatible with the source document. They are compatible
    // if the text is the same and if all the token and sentence annotations have the same
    // offsets.
    JCas initialCas = documentService.createOrReadInitialCas(document);
    String initialText = initialCas.getDocumentText();
    String annotationText = annotationCas.getDocumentText();

    // If any of the texts contains a trailing line break, we ignore that. We assume at the
    // moment that nobody will have created annotations over that trailing line break.
    initialText = StringUtils.chomp(initialText);
    annotationText = StringUtils.chomp(annotationText);
    if (ObjectUtils.notEqual(initialText, annotationText)) {
        // Report up to 20 characters of context starting at the first differing offset
        int diffIndex = StringUtils.indexOfDifference(initialText, annotationText);
        String expected = initialText.substring(diffIndex,
                Math.min(initialText.length(), diffIndex + 20));
        String actual = annotationText.substring(diffIndex,
                Math.min(annotationText.length(), diffIndex + 20));
        throw new IncompatibleDocumentException(
                "Text of annotation document does not match text of source document at offset "
                        + "[%d]. Expected [%s] but found [%s].",
                diffIndex, expected, actual);
    }

    // Just in case we really had to chomp off a trailing line break from the annotation CAS,
    // make sure we copy over the proper text from the initial CAS.
    // WARNING: the sofa string is set forcefully here, bypassing the regular CAS API. Do not
    // copy this pattern elsewhere.
    forceSetFeatureValue(annotationCas.getSofa(), CAS.FEATURE_BASE_NAME_SOFASTRING,
            initialCas.getDocumentText());
    FSUtil.setFeature(annotationCas.getDocumentAnnotationFs(), CAS.FEATURE_BASE_NAME_END,
            initialCas.getDocumentText().length());

    Collection<Sentence> annotationSentences = select(annotationCas, Sentence.class);
    Collection<Sentence> initialSentences = select(initialCas, Sentence.class);
    if (annotationSentences.size() != initialSentences.size()) {
        throw new IncompatibleDocumentException(
                "Expected [%d] sentences, but annotation document contains [%d] sentences.",
                initialSentences.size(), annotationSentences.size());
    }
    assertCompatibleOffsets(initialSentences, annotationSentences);

    Collection<Token> annotationTokens = select(annotationCas, Token.class);
    Collection<Token> initialTokens = select(initialCas, Token.class);
    if (annotationTokens.size() != initialTokens.size()) {
        // Report the token counts here (not the sentence counts, which are known to match at
        // this point) so the message describes the actual mismatch.
        throw new IncompatibleDocumentException(
                "Expected [%d] tokens, but annotation document contains [%d] tokens.",
                initialTokens.size(), annotationTokens.size());
    }
    assertCompatibleOffsets(initialTokens, annotationTokens);

    return annotationCas;
}
use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.
the class AutomationUtil method generateFinalClassifier.
/**
 * Trains the final classifier for the train feature of the given template and saves the
 * resulting model.
 * <p>
 * Training is only performed if something changed: an unprocessed training document exists for
 * the train feature or one of the template's other features, an unprocessed tab-sep document
 * exists for the train feature, or a source document of the project is in state
 * {@code CURATION_FINISHED}. If nothing changed, the result already stored in the template is
 * returned. After training, all training and tab-sep documents of the project are flagged as
 * processed.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aRepository
 *            the document service used to list the source documents of the project.
 * @param aCurationDocumentService
 *            passed on to {@code generateTrainDocument}.
 * @param aAnnotationService
 *            passed on to the feature and training-document generation helpers.
 * @param aAutomationService
 *            provides the MIRA directory, the model file and the training documents.
 * @param aUserDao
 *            passed on to the feature and training-document generation helpers.
 * @return the result of the last training iteration, or the template's stored result if no
 *         training was necessary.
 * @throws UIMAException
 *             propagated from the called helpers.
 * @throws ClassNotFoundException
 *             propagated from the called helpers.
 * @throws IOException
 *             if a training file cannot be read or written.
 * @throws AnnotationException
 *             propagated from the called helpers.
 * @throws AutomationException
 *             propagated from the called helpers.
 */
public static String generateFinalClassifier(MiraTemplate aTemplate, DocumentService aRepository,
        CurationDocumentService aCurationDocumentService,
        AnnotationSchemaService aAnnotationService, AutomationService aAutomationService,
        UserDao aUserDao)
    throws UIMAException, ClassNotFoundException, IOException, AnnotationException,
    AutomationException {
    // Fixed training parameters
    final int frequency = 2;
    final double sigma = 1;
    final int iterations = 10;
    final int beamSize = 0;
    final boolean maxPosteriors = false;

    AnnotationFeature layerFeature = aTemplate.getTrainFeature();
    List<List<String>> predictions = new ArrayList<>();
    File miraDir = aAutomationService.getMiraDir(layerFeature);
    Mira mira = new Mira();

    // All files of this feature share the "<layer id>-<feature id>" name prefix
    String filePrefix = layerFeature.getLayer().getId() + "-" + layerFeature.getId();
    File predFile = new File(miraDir, filePrefix + ".train.ft");
    File predictedFile = new File(predFile.getAbsolutePath() + "-pred");

    boolean retrainNeeded = false;

    // A. training document for other train layers were changed
    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument trainingDoc : aAutomationService
                .listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (trainingDoc.isProcessed() || trainingDoc.getFeature() == null) {
                continue;
            }
            if (trainingDoc.getFeature().equals(otherFeature)) {
                retrainNeeded = true;
                break;
            }
        }
    }

    // B. Training document for the main training layer were changed
    for (TrainingDocument trainingDoc : aAutomationService
            .listTrainingDocuments(layerFeature.getProject())) {
        if (trainingDoc.isProcessed() || trainingDoc.getFeature() == null) {
            continue;
        }
        if (trainingDoc.getFeature().equals(layerFeature)) {
            retrainNeeded = true;
            break;
        }
    }

    // C. New Curation document arrives
    for (SourceDocument sourceDoc : aRepository
            .listSourceDocuments(layerFeature.getProject())) {
        if (sourceDoc.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
            retrainNeeded = true;
            break;
        }
    }

    // D. tab-sep training documents
    for (TrainingDocument tabSepDoc : aAutomationService
            .listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (tabSepDoc.isProcessed() || tabSepDoc.getFeature() == null) {
            continue;
        }
        if (tabSepDoc.getFeature().equals(layerFeature)) {
            retrainNeeded = true;
            break;
        }
    }

    // Nothing changed since the last run: hand back the stored result
    if (!retrainNeeded) {
        return aTemplate.getResult();
    }

    // if no other layer is used, use this as main train document,
    // otherwise, add all the
    // predictions and modify template
    File baseTrainFile = new File(miraDir, filePrefix + ".train.base");
    File trainFile = new File(miraDir, filePrefix + ".train");

    // generate final classifier, using all features generated
    String trainName = trainFile.getAbsolutePath();
    String finalClassifierModelName = aAutomationService
            .getMiraModel(layerFeature, false, null).getAbsolutePath();

    getFeatureOtherLayer(aTemplate, aRepository, aAutomationService, aAnnotationService,
            aUserDao, beamSize, maxPosteriors, predictions, mira, predFile, predictedFile,
            null);
    getFeaturesTabSep(aTemplate, aAutomationService, beamSize, maxPosteriors, layerFeature,
            predictions, mira, predFile, predictedFile);
    generateTrainDocument(aTemplate, aRepository, aCurationDocumentService,
            aAnnotationService, aAutomationService, aUserDao, false);

    // The template is created for as many additional columns as there are predictions; the
    // train file is either the plain base file or the base file enriched with the predictions.
    String trainTemplate = createTemplate(aTemplate.getTrainFeature(),
            getMiraTemplateFile(layerFeature, aAutomationService), predictions.size());
    if (predictions.isEmpty()) {
        FileUtils.copyFile(baseTrainFile, trainFile);
    } else {
        buildTrainFile(baseTrainFile, trainFile, predictions);
    }

    final boolean randomInit = false;
    if (!layerFeature.getLayer().isLockToTokenOffset()) {
        mira.setIobScorer();
    }
    mira.loadTemplates(trainTemplate);
    mira.setClip(sigma);
    mira.maxPosteriors = maxPosteriors;
    mira.beamSize = beamSize;
    int numExamples = mira.count(trainName, frequency);
    mira.initModel(randomInit);

    String trainResult = "";
    for (int iteration = 0; iteration < iterations; iteration++) {
        trainResult = mira.train(trainName, iterations, numExamples, iteration);
        mira.averageWeights(iterations * numExamples);
    }
    mira.saveModel(finalClassifierModelName);

    // all training documents are processed by now
    for (TrainingDocument trainingDoc : aAutomationService
            .listTrainingDocuments(layerFeature.getProject())) {
        trainingDoc.setProcessed(true);
    }
    for (TrainingDocument tabSepDoc : aAutomationService
            .listTabSepDocuments(layerFeature.getProject())) {
        tabSepDoc.setProcessed(true);
    }

    return trainResult;
}
use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.
the class AutomationUtil method addOtherFeatureToPredictDocument.
/**
 * Builds, for every source document of the train feature's project, the prediction input file
 * ({@code <document id>.pred}) from the document's feature file
 * ({@code <document id>.pred.ft}).
 * <p>
 * Predictions are collected via {@code getFeatureOtherLayer} and {@code getFeaturesTabSep}. If
 * none are collected, the feature file is copied as is; otherwise {@code buildPredictFile}
 * combines the feature file with the predictions.
 *
 * @param aTemplate
 *            the template providing the train feature.
 * @param aRepository
 *            the document service used to list the source documents of the project.
 * @param aAnnotationService
 *            passed on to {@code getFeatureOtherLayer}.
 * @param aAutomationService
 *            provides the MIRA directory and is passed on to the helpers.
 * @param aUserDao
 *            passed on to {@code getFeatureOtherLayer}.
 * @throws UIMAException
 *             propagated from the called helpers.
 * @throws ClassNotFoundException
 *             propagated from the called helpers.
 * @throws IOException
 *             if a prediction file cannot be read or written.
 * @throws AnnotationException
 *             propagated from the called helpers.
 * @throws AutomationException
 *             propagated from the called helpers.
 */
public static void addOtherFeatureToPredictDocument(MiraTemplate aTemplate,
        DocumentService aRepository, AnnotationSchemaService aAnnotationService,
        AutomationService aAutomationService, UserDao aUserDao)
    throws UIMAException, ClassNotFoundException, IOException, AnnotationException,
    AutomationException {
    final int beamSize = 0;
    final boolean maxPosteriors = false;

    AnnotationFeature layerFeature = aTemplate.getTrainFeature();
    File miraDir = aAutomationService.getMiraDir(layerFeature);

    for (SourceDocument sourceDoc : aRepository
            .listSourceDocuments(layerFeature.getProject())) {
        // Every document gets its own prediction list and its own Mira instance
        List<List<String>> predictions = new ArrayList<>();
        File featureFile = new File(miraDir, sourceDoc.getId() + ".pred.ft");
        Mira mira = new Mira();
        File predictedFile = new File(featureFile.getAbsolutePath() + "-pred");

        getFeatureOtherLayer(aTemplate, aRepository, aAutomationService, aAnnotationService,
                aUserDao, beamSize, maxPosteriors, predictions, mira, featureFile,
                predictedFile, sourceDoc);
        getFeaturesTabSep(aTemplate, aAutomationService, beamSize, maxPosteriors,
                layerFeature, predictions, mira, featureFile, predictedFile);

        File basePredFile = new File(miraDir, sourceDoc.getId() + ".pred");

        // The template is created for as many additional columns as there are predictions;
        // its return value is not used here.
        createTemplate(aTemplate.getTrainFeature(),
                getMiraTemplateFile(layerFeature, aAutomationService), predictions.size());
        if (predictions.isEmpty()) {
            FileUtils.copyFile(featureFile, basePredFile);
        } else {
            buildPredictFile(featureFile, basePredFile, predictions,
                    aTemplate.getTrainFeature());
        }
    }
}
use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.
the class AutomationUtil method generatePredictDocument.
// TODO: rename to predictDocument
/**
 * Writes, for every source document of the train feature's project, a feature file
 * ({@code <document id>.pred.ft}) into the MIRA directory. The file contains one entry per
 * sentence as produced by {@code getMiraLine}, each followed by a line break.
 * <p>
 * The sentences are taken from the correction CAS of the document. If that cannot be read, the
 * annotation CAS of the current user is used instead.
 *
 * @param aTemplate
 *            the template providing the train feature.
 * @param aRepository
 *            the document service used to list source documents and to read annotation CASes.
 * @param aCorrectionDocumentService
 *            the service used to read the correction CAS.
 * @param aAnnotationService
 *            provides the adapter for the layer of the train feature.
 * @param aAutomationService
 *            provides the MIRA directory.
 * @param aUserDao
 *            provides the current user.
 * @throws IOException
 *             if the MIRA directory or a feature file cannot be written.
 * @throws UIMAException
 *             propagated from reading the annotation CAS.
 * @throws ClassNotFoundException
 *             propagated from reading the annotation CAS.
 */
public static void generatePredictDocument(MiraTemplate aTemplate, DocumentService aRepository,
        CorrectionDocumentService aCorrectionDocumentService,
        AnnotationSchemaService aAnnotationService, AutomationService aAutomationService,
        UserDao aUserDao)
    throws IOException, UIMAException, ClassNotFoundException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }

    User user = aUserDao.getCurrentUser();
    AnnotationFeature feature = aTemplate.getTrainFeature();
    AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService
            .getAdapter(feature.getLayer());

    for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) {
        File predFile = new File(miraDir, document.getId() + ".pred.ft");
        // try-with-resources makes sure the writer is closed even if reading the CAS or
        // writing a line fails; previously the file handle leaked in that case.
        try (BufferedWriter predOut = new BufferedWriter(new FileWriter(predFile))) {
            JCas jCas;
            try {
                jCas = aCorrectionDocumentService.readCorrectionCas(document);
            } catch (Exception e) {
                // Deliberate fallback: without a readable correction CAS, use the annotation
                // CAS of the current user.
                AnnotationDocument annoDoc = aRepository
                        .createOrGetAnnotationDocument(document, user);
                jCas = aRepository.readAnnotationCas(annoDoc);
            }

            for (Sentence sentence : select(jCas, Sentence.class)) {
                predOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
            }
        }
    }
}
use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.
the class AutomationUtil method generateTrainDocument.
/**
 * Writes the training file for the train feature of the given template into the MIRA
 * directory.
 * <p>
 * The file is assembled from three sources: the non-tab-sep training documents of the feature,
 * the source documents in state {@code CURATION_FINISHED}, and the tab-sep documents of the
 * feature. Nothing is written if none of the change checks at the start of the method fires.
 * <p>
 * NOTE(review): check "C" fires as soon as the project has any source document, regardless of
 * its state, while the comment speaks of new curation documents - confirm whether a check for
 * {@code CURATION_FINISHED} was intended.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aRepository
 *            the document service used to list the source documents of the project.
 * @param aCurationDocumentService
 *            the service used to read the curation CAS of finished documents.
 * @param aAnnotationService
 *            provides the adapter for the layer of the train feature.
 * @param aAutomationService
 *            provides the MIRA directory, the automation status and the training documents.
 * @param aUserDao
 *            not used by this method.
 * @param aBase
 *            if {@code true}, the lines are generated without the train feature (second
 *            argument of {@code getMiraLine} is {@code null}, tab-sep label is empty) and the
 *            output goes to the {@code .train.ft} file; otherwise the train feature is
 *            included, the output goes to the {@code .train.base} file, the training documents
 *            are flagged as processed and the status counter of remaining train documents is
 *            decremented.
 * @throws IOException
 *             if the training file or a tab-sep document cannot be read or written.
 * @throws UIMAException
 *             propagated from reading the CASes.
 * @throws ClassNotFoundException
 *             propagated from reading the CASes.
 * @throws AutomationException
 *             if a non-blank line of a tab-sep document does not consist of exactly two
 *             tab-separated columns.
 */
public static void generateTrainDocument(MiraTemplate aTemplate, DocumentService aRepository,
        CurationDocumentService aCurationDocumentService,
        AnnotationSchemaService aAnnotationService, AutomationService aAutomationService,
        UserDao aUserDao, boolean aBase)
    throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    LOG.info("Starting to generate training document");
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }

    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;

    // A. training document for other train layers were changed
    for (AnnotationFeature otherrFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument document : aAutomationService
                .listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null
                    && document.getFeature().equals(otherrFeature)) {
                documentChanged = true;
                break;
            }
        }
    }

    // B. Training document for the main training layer were changed
    for (TrainingDocument document : aAutomationService
            .listTrainingDocuments(feature.getProject())) {
        if (!document.isProcessed() && (document.getFeature() != null
                && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }

    // C. New Curation document arrives
    if (aRepository.listSourceDocuments(feature.getProject()).size() > 0) {
        documentChanged = true;
    }

    // D. tab-sep training documents
    for (TrainingDocument document : aAutomationService
            .listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null
                && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }

    if (!documentChanged) {
        return;
    }

    File trainFile;
    if (aBase) {
        trainFile = new File(miraDir,
                feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir,
                feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }

    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);

    // try-with-resources closes the training file on every exit path, including exceptions
    // thrown while reading a CAS or a tab-sep document.
    try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
        AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService
                .getAdapter(feature.getLayer());

        // Training documents (Curated or webanno-compatible imported ones - read using UIMA)
        List<TrainingDocument> trainingDocuments = aAutomationService
                .listTrainingDocuments(feature.getProject());
        for (TrainingDocument trainingDocument : trainingDocuments) {
            if ((trainingDocument.getFeature() != null
                    && trainingDocument.getFeature().equals(feature))
                    && !trainingDocument.getFormat().equals(WebAnnoConst.TAB_SEP)) {
                JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString())
                                .append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString())
                                .append("\n");
                    }
                }
                trainingDocument.setProcessed(!aBase);
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }

        // for curated documents
        List<SourceDocument> sourceDocuments = aRepository
                .listSourceDocuments(feature.getProject());
        int sourceDocsCounter = 0;
        for (SourceDocument sourceDocument : sourceDocuments) {
            if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
                JCas jCas = aCurationDocumentService.readCurationCas(sourceDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString())
                                .append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString())
                                .append("\n");
                    }
                }
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
            sourceDocsCounter++;
            // Report progress against the number of source documents (the list being iterated),
            // not against the number of training documents.
            LOG.info("Processed source document " + sourceDocsCounter + " of "
                    + sourceDocuments.size());
        }

        // Tab-sep documents to be used as a target layer train document
        int goldStandardDocsCounter = 0;
        List<TrainingDocument> goldStandardDocs = aAutomationService
                .listTabSepDocuments(feature.getProject());
        for (TrainingDocument document : goldStandardDocs) {
            if (document.getFormat().equals(WebAnnoConst.TAB_SEP)
                    && document.getFeature() != null
                    && document.getFeature().equals(feature)) {
                File tabSepFile = new File(aAutomationService.getDocumentFolder(document),
                        document.getName());
                // Close the reader once the document has been consumed or an error occurred
                try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                    LineIterator it = IOUtils.lineIterator(tabSepReader);
                    while (it.hasNext()) {
                        String line = it.next();
                        if (line.trim().equals("")) {
                            // A blank line is written through as a blank line
                            trainOut.append("\n");
                        } else {
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            if (st.countTokens() != 2) {
                                throw new AutomationException(
                                        "This is not a valid TAB-SEP document");
                            }
                            if (aBase) {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), ""));
                            } else {
                                trainOut.append(
                                        getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                            }
                        }
                    }
                }
            }
            goldStandardDocsCounter++;
            LOG.info("Processed gold standard document " + goldStandardDocsCounter + " of "
                    + goldStandardDocs.size());
        }
    }
    LOG.info("Completed generating training document");
}
Aggregations