Use of com.joliciel.talismane.lexicon.WritableLexicalEntry in the project talismane by joliciel-informatique.
Class TokenPerLineCorpusReader, method hasNextSentence.
/**
 * Tries to load the next sentence into {@code sentenceLines} and reports
 * whether one is available.
 * <p>
 * If a positive maximum sentence count is configured and {@code sentenceCount}
 * has reached it, nothing is read. Otherwise, as long as {@code sentenceLines}
 * is {@code null}, lines are read one at a time. Non-empty lines are buffered
 * (after being tested against the skip patterns and the sentence rules); an
 * empty line, or running out of input while lines are buffered, closes the
 * current sentence. A closed sentence is discarded if the cross-validation
 * settings or the start-sentence setting exclude it. Otherwise its non-skipped
 * lines are converted to {@link CorpusLine}s, the matched sentence rules are
 * applied (possibly merging lines and renumbering indexes), a {@link Sentence}
 * is obtained or rebuilt, annotated, and handed to {@code processSentence}.
 * <p>
 * NOTE(review): this method only ever assigns a non-null value to
 * {@code sentenceLines}; it is presumably reset to {@code null} elsewhere when
 * the sentence is consumed - confirm against the rest of the class.
 *
 * @return {@code true} if {@code sentenceLines} is non-null when the method
 *         exits
 * @throws TalismaneException
 *           if the sentence text has to be rebuilt and no linguistic rules
 *           have been set on the session; callees declared in this method may
 *           presumably throw it as well
 * @throws IOException
 *           declared by the signature; presumably raised by the underlying
 *           line reading - confirm
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
// we've reached the end, do nothing
} else {
// keep reading until a sentence has been loaded or the input runs out
while (sentenceLines == null) {
List<UnprocessedLine> lines = new ArrayList<>();
int skippedLineCount = 0;
// no more input at all: leave sentenceLines null
if (!this.hasNextLine())
break;
// When the input is exhausted but lines are still buffered, one more
// pass runs with an empty line so that the final sentence gets flushed.
while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
String line = "";
if (this.hasNextLine())
line = this.nextLine().replace("\r", "");
// NOTE(review): also incremented for the synthetic empty line used to
// flush the last sentence at end of input.
lineNumber++;
if (LOG.isTraceEnabled())
LOG.trace("Line " + lineNumber + ": " + line);
if (line.length() > 0) {
// non-empty line: belongs to the sentence currently being buffered
boolean skip = false;
// the first skip pattern that fully matches marks the line as skipped
for (Pattern skipLinePattern : skipLinePatterns) {
if (skipLinePattern.matcher(line).matches()) {
if (LOG.isTraceEnabled())
LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
skip = true;
skippedLineCount++;
break;
}
}
// collect every sentence rule whose pattern fully matches this line,
// together with its matcher (done for skipped lines as well)
List<CorpusSentenceRule> myRules = new ArrayList<>();
List<Matcher> myMatchers = new ArrayList<>();
for (CorpusSentenceRule sentenceRule : sentenceRules) {
Matcher matcher = sentenceRule.getPattern().matcher(line);
if (matcher.matches()) {
if (LOG.isTraceEnabled())
LOG.trace("Matched rule: " + sentenceRule);
myRules.add(sentenceRule);
myMatchers.add(matcher);
}
}
UnprocessedLine unprocessedLine = new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers);
lines.add(unprocessedLine);
} else {
// empty line with nothing buffered, or with only skipped lines
// buffered: discard the buffer and keep reading
if (lines.size() == 0 || lines.size() == skippedLineCount) {
lines = new ArrayList<>();
skippedLineCount = 0;
continue;
}
// end of sentence
boolean includeMe = true;
// check cross-validation
if (this.getCrossValidationSize() > 0) {
if (this.getIncludeIndex() >= 0) {
// include mode: keep only sentences whose position modulo the
// cross-validation size equals the include index
if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
includeMe = false;
}
} else if (this.getExcludeIndex() >= 0) {
// exclude mode: drop sentences whose position modulo the
// cross-validation size equals the exclude index
if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
includeMe = false;
}
}
}
// sentences located before the configured start sentence are dropped
if (this.getStartSentence() > sentenceCount) {
includeMe = false;
}
// the counter advances for excluded sentences too, so the maximum
// sentence count test at the top of the method counts them as well
sentenceCount++;
LOG.debug("sentenceCount: " + sentenceCount);
if (!includeMe) {
lines = new ArrayList<>();
skippedLineCount = 0;
continue;
}
// convert every non-skipped buffered line into a corpus line,
// attaching a lexical entry when a lexical entry reader is configured
sentenceLines = new ArrayList<>();
for (UnprocessedLine unprocessedLine : lines) {
if (!unprocessedLine.skip) {
CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
sentenceLines.add(corpusLine);
if (this.lexicalEntryReader != null) {
WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
corpusLine.setLexicalEntry(lexicalEntry);
}
}
}
// apply the sentence rules: for each buffered line, its matched rules
// are tried in order until one returns a non-null action; only merge
// actions are retained for further processing
List<CorpusSentenceRule.MergeAction> mergeActions = new ArrayList<>();
for (UnprocessedLine unprocessedLine : lines) {
if (LOG.isTraceEnabled())
LOG.trace("Line " + unprocessedLine);
for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
Matcher matcher = unprocessedLine.matchers.get(i);
if (LOG.isTraceEnabled())
LOG.trace("Testing rule " + sentenceRule);
CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, sentenceLines);
if (LOG.isTraceEnabled())
LOG.trace("Result: " + action);
if (action != null) {
if (action instanceof MergeAction)
mergeActions.add((MergeAction) action);
break;
}
}
}
if (mergeActions.size() > 0) {
List<CorpusLine> newSentenceLines = new ArrayList<>();
// map the index of each line to merge onto its merge action; the
// TreeMap yields the indexes in ascending order
Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
for (CorpusSentenceRule.MergeAction mergeAction : mergeActions) {
for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
}
}
// i counts positions in sentenceLines starting at 1 and is compared
// with CorpusLine.getIndex() values - assumes those indexes are the
// 1-based positions within the sentence; TODO confirm
int i = 1;
Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
// NOTE(review): throws NoSuchElementException if no merge action
// lists any line to merge - presumably cannot happen; confirm
int nextIndexToMerge = iIndexToMerge.next();
int linesRemoved = 0;
// old index -> new index after merging; 0 maps onto itself
// (presumably the root governor - confirm)
Map<Integer, Integer> indexChangeMap = new HashMap<>();
indexChangeMap.put(0, 0);
for (CorpusLine corpusLine : sentenceLines) {
if (i == nextIndexToMerge) {
MergeAction mergeAction = indexesToMerge.get(i);
// the first line of a merge group is replaced by the merged line
// (net removal count unchanged); the other lines of the group are
// dropped and counted as removed
if (i == mergeAction.getFirstIndex()) {
newSentenceLines.add(mergeAction.getMergedLine());
linesRemoved -= 1;
}
linesRemoved += 1;
// advance to the next index to merge, or -1 when none are left
if (iIndexToMerge.hasNext())
nextIndexToMerge = iIndexToMerge.next();
else
nextIndexToMerge = -1;
} else {
newSentenceLines.add(corpusLine);
}
indexChangeMap.put(i, i - linesRemoved);
i++;
}
// renumber the index and governor references of the remaining lines
// NOTE(review): an index absent from indexChangeMap would be written
// as the string "null" - confirm this cannot occur
for (CorpusLine corpusLine : newSentenceLines) {
corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
int governorIndex = corpusLine.getGovernorIndex();
if (governorIndex >= 0)
corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
if (nonProjGovernorIndex >= 0)
corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
}
sentenceLines = newSentenceLines;
}
// obtain the sentence from the sentence reader when one is available
// and has a sentence; otherwise rebuild the text from the tokens,
// letting the linguistic rules decide where spaces are inserted
Sentence sentence = null;
if (sentenceReader != null && sentenceReader.hasNextSentence()) {
sentence = sentenceReader.nextSentence();
} else {
LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
if (rules == null)
throw new TalismaneException("Linguistic rules have not been set.");
String text = "";
for (CorpusLine corpusLine : sentenceLines) {
String word = corpusLine.getElement(CorpusElement.TOKEN);
if (rules.shouldAddSpace(text, word))
text += " ";
text += word;
}
sentence = new Sentence(text, currentFile, sessionId);
}
// run every sentence annotator registered on the session
for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
sentenceAnnotator.annotate(sentence);
}
// hand the sentence and its corpus lines over for further processing
this.processSentence(sentence, sentenceLines);
}
}
}
}
return (sentenceLines != null);
}
Use of com.joliciel.talismane.lexicon.WritableLexicalEntry in the project talismane by joliciel-informatique.
Class CorpusLineReader, method read.
/**
 * Parses a single corpus line into a {@link CorpusLine}.
 * <p>
 * The line must fully match this reader's pattern. For every
 * {@link CorpusElement} that has an entry in the placeholder index map, the
 * corresponding regex group is copied onto the corpus line: tokens and lemmas
 * are passed through the session's CoNLL formatter, while for any other
 * element a value of "_" is stored as the empty string. A lexical entry is
 * attached when a lexical entry reader is configured. Finally the corpus rules
 * are applied, and the values they request are written back onto the line.
 *
 * @param line
 *          the raw text of the line to parse
 * @param lineNumber
 *          the number of the line that was reached, starting at 1
 * @return the populated corpus line
 * @throws TalismaneException
 *           if the line does not match the pattern
 */
public CorpusLine read(String line, int lineNumber) throws TalismaneException {
  Matcher lineMatcher = this.pattern.matcher(line);
  if (!lineMatcher.matches()) {
    throw new TalismaneException("Didn't match pattern \"" + regex + "\". Compiled to: \"" + this.pattern.pattern() + "\". On line " + lineNumber + ": " + line);
  }

  CorpusLine result = new CorpusLine(line, lineNumber);

  // Copy each configured element out of its regex group.
  for (CorpusElement elementType : CorpusElement.values()) {
    if (!placeholderIndexMap.containsKey(elementType)) {
      continue;
    }
    String extracted = lineMatcher.group(placeholderIndexMap.get(elementType));
    if (elementType == CorpusElement.TOKEN || elementType == CorpusElement.LEMMA) {
      // tokens and lemmas are converted back from their CoNLL representation
      extracted = TalismaneSession.get(sessionId).getCoNLLFormatter().fromCoNLL(extracted);
    } else if ("_".equals(extracted)) {
      // for all other elements, "_" is stored as an empty value
      extracted = "";
    }
    result.setElement(elementType, extracted);
  }

  // Attach a lexical entry read from the raw line, if a reader is configured.
  if (this.lexicalEntryReader != null) {
    WritableLexicalEntry entry = new CompactLexicalEntry(lexicalEntrySupport);
    this.lexicalEntryReader.readEntry(line, entry);
    result.setLexicalEntry(entry);
  }

  // Let every corpus rule record the values it wants changed, then apply them.
  Map<CorpusElement, String> pendingUpdates = new HashMap<>();
  for (CorpusRule rule : corpusRules) {
    rule.apply(result, pendingUpdates);
  }
  for (Map.Entry<CorpusElement, String> update : pendingUpdates.entrySet()) {
    CorpusElement element = update.getKey();
    String newValue = update.getValue();
    if (LOG.isTraceEnabled()) {
      LOG.trace("On line " + lineNumber + ", updating " + element.name() + " from '" + result.getElement(element) + "' to '" + newValue + "'");
    }
    result.setElement(element, newValue);
  }

  return result;
}
Aggregations