Example use of com.joliciel.talismane.LinguisticRules in the project talismane by joliciel-informatique.
From the class TokenPerLineCorpusReader, method hasNextSentence.
/**
 * Ensures that the next sentence, if any, has been read, and reports whether
 * one is available.
 * <p>
 * Nothing is read once a positive maximum sentence count has been reached.
 * Otherwise lines are consumed until an empty line (or the end of the input)
 * closes a group of non-empty lines. Groups rejected by the cross-validation
 * settings or located before the start sentence are counted and dropped. For
 * an accepted group, the non-skipped lines are converted to corpus lines, the
 * matching sentence rules are applied (including any merges they request),
 * the sentence is obtained or reconstructed, annotated, and handed to
 * {@code processSentence}.
 *
 * @return {@code true} if {@code sentenceLines} is non-null when the method
 *         returns
 * @throws TalismaneException
 *           if the linguistic rules have not been set, or if thrown by one of
 *           the readers or rules invoked here
 * @throws IOException
 *           if thrown while reading the underlying input
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
  if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
    // we've reached the end, do nothing
    return (sentenceLines != null);
  }

  while (sentenceLines == null) {
    List<UnprocessedLine> lines = new ArrayList<>();
    int skippedLineCount = 0;
    if (!this.hasNextLine()) {
      break;
    }

    // The "lines.size() > 0" clause allows one extra pass after the last input
    // line: "line" then stays empty, which closes a final sentence that is not
    // followed by a blank line.
    while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
      String line = "";
      if (this.hasNextLine()) {
        line = this.nextLine().replace("\r", "");
      }
      lineNumber++;
      if (LOG.isTraceEnabled()) {
        LOG.trace("Line " + lineNumber + ": " + line);
      }

      if (line.length() > 0) {
        // Non-empty line: record it, together with its skip status and the
        // sentence rules whose pattern matches it.
        boolean skip = false;
        for (Pattern skipLinePattern : skipLinePatterns) {
          if (skipLinePattern.matcher(line).matches()) {
            if (LOG.isTraceEnabled()) {
              LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
            }
            skip = true;
            skippedLineCount++;
            break;
          }
        }

        List<CorpusSentenceRule> myRules = new ArrayList<>();
        List<Matcher> myMatchers = new ArrayList<>();
        for (CorpusSentenceRule sentenceRule : sentenceRules) {
          Matcher matcher = sentenceRule.getPattern().matcher(line);
          if (matcher.matches()) {
            if (LOG.isTraceEnabled()) {
              LOG.trace("Matched rule: " + sentenceRule);
            }
            myRules.add(sentenceRule);
            myMatchers.add(matcher);
          }
        }

        lines.add(new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers));
        continue;
      }

      // Empty line: nothing to close if no line, or only skipped lines, were
      // collected since the previous sentence.
      if (lines.size() == 0 || lines.size() == skippedLineCount) {
        lines = new ArrayList<>();
        skippedLineCount = 0;
        continue;
      }

      // end of sentence: the inclusion test must see the count as it was
      // before this sentence is counted
      boolean includeMe = this.isCurrentSentenceIncluded();
      sentenceCount++;
      if (LOG.isDebugEnabled()) {
        LOG.debug("sentenceCount: " + sentenceCount);
      }
      if (!includeMe) {
        lines = new ArrayList<>();
        skippedLineCount = 0;
        continue;
      }

      // sentenceLines is assigned before being filled, as the sentence rules
      // applied below receive this same list
      sentenceLines = new ArrayList<>();
      for (UnprocessedLine unprocessedLine : lines) {
        if (!unprocessedLine.skip) {
          CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
          sentenceLines.add(corpusLine);
          if (this.lexicalEntryReader != null) {
            WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
            this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
            corpusLine.setLexicalEntry(lexicalEntry);
          }
        }
      }

      List<MergeAction> mergeActions = this.collectMergeActions(lines, sentenceLines);
      if (mergeActions.size() > 0) {
        sentenceLines = this.applyMergeActions(sentenceLines, mergeActions);
      }

      Sentence sentence = this.obtainSentence(sentenceLines);
      for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
        sentenceAnnotator.annotate(sentence);
      }
      this.processSentence(sentence, sentenceLines);
    }
  }
  return (sentenceLines != null);
}

/**
 * Tells whether the sentence about to be counted passes the cross-validation
 * and start-sentence filters, based on the current value of
 * {@code sentenceCount}.
 */
private boolean isCurrentSentenceIncluded() {
  boolean includeMe = true;
  // check cross-validation
  if (this.getCrossValidationSize() > 0) {
    if (this.getIncludeIndex() >= 0) {
      if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
        includeMe = false;
      }
    } else if (this.getExcludeIndex() >= 0) {
      if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
        includeMe = false;
      }
    }
  }
  if (this.getStartSentence() > sentenceCount) {
    includeMe = false;
  }
  return includeMe;
}

/**
 * Applies to each line the sentence rules that matched it, stopping at the
 * first rule which returns a non-null action, and collects those actions
 * which are merge actions.
 */
private List<MergeAction> collectMergeActions(List<UnprocessedLine> lines, List<CorpusLine> corpusLines)
    throws TalismaneException, IOException {
  List<MergeAction> mergeActions = new ArrayList<>();
  for (UnprocessedLine unprocessedLine : lines) {
    if (LOG.isTraceEnabled()) {
      LOG.trace("Line " + unprocessedLine);
    }
    for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
      CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
      Matcher matcher = unprocessedLine.matchers.get(i);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Testing rule " + sentenceRule);
      }
      CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, corpusLines);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Result: " + action);
      }
      if (action != null) {
        if (action instanceof MergeAction) {
          mergeActions.add((MergeAction) action);
        }
        break;
      }
    }
  }
  return mergeActions;
}

/**
 * Replaces each group of merged lines by its merged line and renumbers the
 * index, governor and non-projective governor of the remaining lines
 * accordingly. Returns the given list untouched if the merge actions list no
 * line to merge.
 */
private List<CorpusLine> applyMergeActions(List<CorpusLine> corpusLines, List<MergeAction> mergeActions)
    throws TalismaneException, IOException {
  // sorted by line index, so that the indexes can be walked in step with the lines
  Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
  for (MergeAction mergeAction : mergeActions) {
    for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
      indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
    }
  }
  if (indexesToMerge.isEmpty()) {
    // nothing to merge: avoids calling next() on an empty iterator below
    return corpusLines;
  }

  List<CorpusLine> newSentenceLines = new ArrayList<>();
  // the loop below numbers the lines from 1, in list order
  int i = 1;
  Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
  int nextIndexToMerge = iIndexToMerge.next();
  int linesRemoved = 0;
  // maps each original index to its index after the merges; 0 maps to itself
  Map<Integer, Integer> indexChangeMap = new HashMap<>();
  indexChangeMap.put(0, 0);
  for (CorpusLine corpusLine : corpusLines) {
    if (i == nextIndexToMerge) {
      MergeAction mergeAction = indexesToMerge.get(i);
      if (i == mergeAction.getFirstIndex()) {
        // the first line of a group is replaced by the merged line, hence
        // cancels out the increment below
        newSentenceLines.add(mergeAction.getMergedLine());
        linesRemoved -= 1;
      }
      linesRemoved += 1;
      if (iIndexToMerge.hasNext()) {
        nextIndexToMerge = iIndexToMerge.next();
      } else {
        nextIndexToMerge = -1;
      }
    } else {
      newSentenceLines.add(corpusLine);
    }
    indexChangeMap.put(i, i - linesRemoved);
    i++;
  }

  for (CorpusLine corpusLine : newSentenceLines) {
    corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
    int governorIndex = corpusLine.getGovernorIndex();
    if (governorIndex >= 0) {
      corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
    }
    int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
    if (nonProjGovernorIndex >= 0) {
      corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
    }
  }
  return newSentenceLines;
}

/**
 * Takes the next sentence from the sentence reader when one is available,
 * and otherwise rebuilds the sentence text from the tokens of the corpus
 * lines, using the linguistic rules to decide where to add spaces.
 */
private Sentence obtainSentence(List<CorpusLine> corpusLines) throws TalismaneException, IOException {
  if (sentenceReader != null && sentenceReader.hasNextSentence()) {
    return sentenceReader.nextSentence();
  }

  LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
  if (rules == null) {
    throw new TalismaneException("Linguistic rules have not been set.");
  }
  String text = "";
  for (CorpusLine corpusLine : corpusLines) {
    String word = corpusLine.getElement(CorpusElement.TOKEN);
    if (rules.shouldAddSpace(text, word)) {
      text += " ";
    }
    text += word;
  }
  return new Sentence(text, currentFile, sessionId);
}
Example use of com.joliciel.talismane.LinguisticRules in the project talismane by joliciel-informatique.
From the class StandoffReader, method hasNextSentence.
/**
 * Ensures that the parse configuration of the next sentence, if any, has been
 * built, and reports whether one is available.
 * <p>
 * Nothing is built once a positive maximum sentence count has been reached,
 * when a configuration is already pending, or when all sentences have been
 * consumed. Otherwise the next list of standoff tokens is turned into a
 * sentence (text rebuilt with the linguistic rules, then annotated), a
 * pos-tag sequence and a parse configuration, to which the dependencies found
 * in {@code relationMap} are added. Punctuation tokens without a relation are
 * attached to the nearest preceding non-punctuation token when
 * {@code punctuationDepLabel} is set.
 *
 * @return {@code true} if {@code configuration} is non-null when the method
 *         returns
 * @throws TalismaneException
 *           if a relation refers to a token id which is not in the sentence
 * @throws IOException
 *           declared by the method signature; not thrown directly here
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
  boolean maxReached = this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount();
  if (maxReached || configuration != null || sentenceIndex >= sentences.size()) {
    // either we've reached the end, a configuration is already waiting, or
    // there are no sentences left
    return (configuration != null);
  }

  List<StandoffToken> tokens = sentences.get(sentenceIndex++);

  // NOTE(review): the token-per-line reader reports this same condition with
  // a TalismaneException; kept as RuntimeException here so as not to change
  // what callers may observe - confirm which is intended.
  LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
  if (rules == null) {
    throw new RuntimeException("Linguistic rules have not been set.");
  }

  // rebuild the sentence text from the token texts
  String text = "";
  for (StandoffToken standoffToken : tokens) {
    String word = standoffToken.text;
    if (rules.shouldAddSpace(text, word)) {
      text += " ";
    }
    text += word;
  }

  Sentence sentence = new Sentence(text, sessionId);
  for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
    annotator.annotate(sentence);
  }

  PretokenisedSequence tokenSequence = new PretokenisedSequence(sentence, sessionId);
  PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);
  // lets the relations, which refer to tokens by id, find the pos-tagged tokens
  Map<String, PosTaggedToken> idTokenMap = new HashMap<>();
  for (StandoffToken standoffToken : tokens) {
    Token token = tokenSequence.addToken(standoffToken.text);
    Decision posTagDecision = new Decision(standoffToken.posTag.getCode());
    PosTaggedToken posTaggedToken = new PosTaggedToken(token, posTagDecision, sessionId);
    if (LOG.isTraceEnabled()) {
      LOG.trace(posTaggedToken.toString());
    }
    posTaggedToken.setComment(standoffToken.comment);
    posTagSequence.addPosTaggedToken(posTaggedToken);
    idTokenMap.put(standoffToken.id, posTaggedToken);
    // guarded, so that the message is only built when it will be logged
    if (LOG.isDebugEnabled()) {
      LOG.debug("Found token " + standoffToken.id + ", " + posTaggedToken);
    }
  }
  tokenSequence.setWithRoot(true);

  configuration = new ParseConfiguration(posTagSequence);
  for (StandoffToken standoffToken : tokens) {
    StandoffRelation relation = relationMap.get(standoffToken.id);
    if (relation != null) {
      PosTaggedToken head = idTokenMap.get(relation.fromToken);
      PosTaggedToken dependent = idTokenMap.get(relation.toToken);
      if (head == null) {
        throw new TalismaneException("No token found for head id: " + relation.fromToken);
      }
      if (dependent == null) {
        throw new TalismaneException("No token found for dependent id: " + relation.toToken);
      }
      DependencyArc arc = configuration.addDependency(head, dependent, relation.label, null);
      arc.setComment(relation.comment);
    } else if (standoffToken.posTag.getOpenClassIndicator() == PosTagOpenClassIndicator.PUNCTUATION) {
      if (punctuationDepLabel != null) {
        // attach the punctuation to the closest preceding token which is
        // not itself punctuation, if there is one
        PosTaggedToken dependent = idTokenMap.get(standoffToken.id);
        for (int i = dependent.getIndex() - 1; i >= 0; i--) {
          PosTaggedToken head = posTagSequence.get(i);
          if (head.getTag().getOpenClassIndicator() == PosTagOpenClassIndicator.PUNCTUATION) {
            continue;
          }
          configuration.addDependency(head, dependent, punctuationDepLabel, null);
          break;
        }
      }
    }
  }
  return (configuration != null);
}
Aggregations