Use of com.joliciel.talismane.utils.WeightedOutcome in the project talismane by joliciel-informatique.
From the class LexicalAttributeFeatureTest, method testCheckInternal.
@Test
public void testCheckInternal() throws Exception {
  // Point the Typesafe config at a test configuration that includes a lexicon,
  // and force a re-read in case another test already loaded a different one.
  // The loaded Config itself is not used directly, so no local is kept.
  System.setProperty("config.file", "src/test/resources/testWithLex.conf");
  ConfigFactory.invalidateCaches();
  ConfigFactory.load();

  final String sessionId = "test";

  // Build a one-token context: "dame" inside the sentence "une dame",
  // pos-tagged as a common noun (NC).
  Sentence sentence = new Sentence("une dame", sessionId);
  TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
  Token token = new Token("dame", tokenSequence, 1, "une ".length(), "une dame".length(), sessionId);
  Decision decision = new Decision("NC", 1.0);
  final PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, sessionId);

  // Address function that always resolves to the token built above.
  PosTaggedTokenAddressFunction<PosTaggerContext> addressFunction = new AbstractPosTaggedTokenAddressFunction() {

    @Override
    protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
      return this.generateResult(posTaggedToken);
    }
  };

  // Feature under test: look up the lexical Gender attribute of the addressed token.
  StringLiteralFeature<PosTaggedTokenWrapper> gender = new StringLiteralFeature<>(LexicalAttribute.Gender.name());
  LexicalAttributeFeature<PosTaggerContext> feature = new LexicalAttributeFeature<>(addressFunction, gender);

  PosTagSequence history = new PosTagSequence(tokenSequence);
  PosTaggerContext context = new PosTaggerContextImpl(token, history);
  RuntimeEnvironment env = new RuntimeEnvironment();
  FeatureResult<List<WeightedOutcome<String>>> featureResult = feature.checkInternal(context, env);

  List<WeightedOutcome<String>> outcomes = featureResult.getOutcome();
  System.out.println(outcomes);

  // Assert the size first so an empty result fails with a clear assertion
  // message rather than an IndexOutOfBoundsException on get(0).
  assertEquals(1, outcomes.size());
  assertEquals("f", outcomes.get(0).getOutcome());
}
Use of com.joliciel.talismane.utils.WeightedOutcome in the project talismane by joliciel-informatique.
From the class LexicalAttributeFeatureTest, method testCheckInternalMultipleEntries.
@Test
public void testCheckInternalMultipleEntries() throws Exception {
  // Load a test configuration that includes a lexicon, invalidating any
  // cached configuration first. The returned Config is unused, so no
  // local variable is kept.
  System.setProperty("config.file", "src/test/resources/testWithLex.conf");
  ConfigFactory.invalidateCaches();
  ConfigFactory.load();

  final String sessionId = "test";

  // Context: the verb "demande" in "je demande", tagged V. The assertions
  // below expect its lexicon entries to yield two Person values ("1", "3").
  Sentence sentence = new Sentence("je demande", sessionId);
  TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
  Token token = new Token("demande", tokenSequence, 1, "je ".length(), "je demande".length(), sessionId);
  Decision decision = new Decision("V", 1.0);
  final PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, sessionId);

  // Address function that always resolves to the token built above.
  PosTaggedTokenAddressFunction<PosTaggerContext> addressFunction = new AbstractPosTaggedTokenAddressFunction() {

    @Override
    protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
      return this.generateResult(posTaggedToken);
    }
  };

  // Feature under test: look up the lexical Person attribute of the addressed token.
  StringLiteralFeature<PosTaggedTokenWrapper> person = new StringLiteralFeature<>(LexicalAttribute.Person.name());
  LexicalAttributeFeature<PosTaggerContext> feature = new LexicalAttributeFeature<>(addressFunction, person);

  PosTagSequence history = new PosTagSequence(tokenSequence);
  PosTaggerContext context = new PosTaggerContextImpl(token, history);
  RuntimeEnvironment env = new RuntimeEnvironment();
  FeatureResult<List<WeightedOutcome<String>>> featureResult = feature.checkInternal(context, env);

  List<WeightedOutcome<String>> outcomes = featureResult.getOutcome();
  System.out.println(outcomes);

  // Assert the size first for a clearer failure, then check that every
  // outcome is one of the two expected person values, in any order.
  assertEquals(2, outcomes.size());
  for (WeightedOutcome<String> outcome : outcomes) {
    assertTrue("1".equals(outcome.getOutcome()) || "3".equals(outcome.getOutcome()));
  }
}
Use of com.joliciel.talismane.utils.WeightedOutcome in the project talismane by joliciel-informatique.
From the class MaxentDetailedAnalysisWriter, method onAnalyse.
/*
 * (non-Javadoc)
 *
 * @see com.joliciel.talismane.maxent.MaxentObserver#onAnalyse(java.util.List,
 * java.util.Collection)
 */
@Override
public void onAnalyse(Object event, List<FeatureResult<?>> featureResults, Collection<Decision> outcomes) throws IOException {
  // Running per-outcome score totals in log space, each seeded with the
  // uniform prior log(1/numOutcomes).
  Map<String, Double> outcomeTotals = new TreeMap<String, Double>();
  double uniformPrior = Math.log(1 / (double) outcomeList.size());
  for (String outcome : outcomeList) outcomeTotals.put(outcome, uniformPrior);
  writer.append("####### Event: " + event.toString() + "\n");
  writer.append("### Feature results:\n");
  for (FeatureResult<?> featureResult : featureResults) {
    if (featureResult.getOutcome() instanceof List) {
      // Collection-valued feature: write one detail line per weighted string
      // outcome, using the outcome's weight as the feature value.
      @SuppressWarnings("unchecked") FeatureResult<List<WeightedOutcome<String>>> stringCollectionResult = (FeatureResult<List<WeightedOutcome<String>>>) featureResult;
      for (WeightedOutcome<String> stringOutcome : stringCollectionResult.getOutcome()) {
        String featureName = featureResult.getTrainingName() + "|" + featureResult.getTrainingOutcome(stringOutcome.getOutcome());
        String featureOutcome = stringOutcome.getOutcome();
        double value = stringOutcome.getWeight();
        this.writeFeatureResult(featureName, featureOutcome, value, outcomeTotals);
      }
    } else {
      // Scalar feature: value is 1.0, except for a DoubleFeature, whose
      // outcome is itself the value.
      double value = 1.0;
      if (featureResult.getFeature() instanceof DoubleFeature) {
        value = (Double) featureResult.getOutcome();
      }
      this.writeFeatureResult(featureResult.getTrainingName(), featureResult.getOutcome().toString(), value, outcomeTotals);
    }
  }
  writer.append("### Outcome totals:\n");
  writer.append("# Uniform prior: " + uniformPrior + " (=1/" + outcomeList.size() + ")\n");
  // Normalising constant: sum of exp(total) over all outcomes, used below
  // to report each outcome's normalised share.
  double grandTotal = 0;
  for (String outcome : outcomeList) {
    double total = outcomeTotals.get(outcome);
    double expTotal = Math.exp(total);
    grandTotal += expTotal;
  }
  // Fixed-width table: outcome, log total, exp total, normalised value.
  writer.append(String.format("%1$-30s", "outcome") + String.format("%1$#15s", "total(log)") + String.format("%1$#15s", "total") + String.format("%1$#15s", "normalised") + "\n");
  for (String outcome : outcomeList) {
    double total = outcomeTotals.get(outcome);
    double expTotal = Math.exp(total);
    writer.append(String.format("%1$-30s", outcome) + String.format("%1$#15s", decFormat.format(total)) + String.format("%1$#15s", decFormat.format(expTotal)) + String.format("%1$#15s", decFormat.format(expTotal / grandTotal)) + "\n");
  }
  writer.append("\n");
  // Map each decision's outcome to its probability for the final listing.
  Map<String, Double> outcomeWeights = new TreeMap<String, Double>();
  for (Decision decision : outcomes) {
    outcomeWeights.put(decision.getOutcome(), decision.getProbability());
  }
  writer.append("### Outcome list:\n");
  // TreeSet orders the entries by WeightedOutcome's natural ordering;
  // outcomes absent from the decisions get weight 0.0.
  Set<WeightedOutcome<String>> weightedOutcomes = new TreeSet<WeightedOutcome<String>>();
  for (String outcome : outcomeList) {
    Double weightObj = outcomeWeights.get(outcome);
    double weight = (weightObj == null ? 0.0 : weightObj.doubleValue());
    WeightedOutcome<String> weightedOutcome = new WeightedOutcome<String>(outcome, weight);
    weightedOutcomes.add(weightedOutcome);
  }
  for (WeightedOutcome<String> weightedOutcome : weightedOutcomes) {
    writer.append(String.format("%1$-30s", weightedOutcome.getOutcome()) + String.format("%1$#15s", decFormat.format(weightedOutcome.getWeight())) + "\n");
  }
  writer.append("\n");
  writer.flush();
}
Use of com.joliciel.talismane.utils.WeightedOutcome in the project talismane by joliciel-informatique.
From the class StringCollectionFeatureWrapper, method check.
@Override
public FeatureResult<List<WeightedOutcome<String>>> check(T context, RuntimeEnvironment env) throws TalismaneException {
  // One weighted string outcome will be produced per combination in the
  // cross-product of the enclosed collections' results.
  List<WeightedOutcome<String>> finalList = new ArrayList<WeightedOutcome<String>>();
  FeatureResult<List<WeightedOutcome<String>>> finalResult = null;
  // get the collection results for each enclosed collection
  List<FeatureResult<List<WeightedOutcome<String>>>> collectionResultList = new ArrayList<FeatureResult<List<WeightedOutcome<String>>>>();
  for (StringCollectionFeature<T> collectionFeature : collectionFeatures) {
    FeatureResult<List<WeightedOutcome<String>>> collectionResults = collectionFeature.check(context, env);
    // Null results (feature did not apply) are simply skipped.
    if (collectionResults != null)
      collectionResultList.add(collectionResults);
  }
  if (collectionResultList.size() > 0) {
    // we do a cross product of all of the results from all of the
    // enclosed collections
    List<List<CollectionFeatureResult>> crossProduct = new ArrayList<List<CollectionFeatureResult>>();
    // Seed with one empty combination so the first collection's outcomes
    // each start a combination.
    crossProduct.add(new ArrayList<CollectionFeatureResult>());
    for (FeatureResult<List<WeightedOutcome<String>>> collectionResults : collectionResultList) {
      String featureName = collectionResults.getFeature().getName();
      List<List<CollectionFeatureResult>> newCrossProduct = new ArrayList<List<CollectionFeatureResult>>();
      for (WeightedOutcome<String> collectionResult : collectionResults.getOutcome()) {
        // Extend every existing partial combination with this outcome.
        for (List<CollectionFeatureResult> oneList : crossProduct) {
          List<CollectionFeatureResult> newList = new ArrayList<CollectionFeatureResult>(oneList);
          CollectionFeatureResult result = new CollectionFeatureResult();
          result.featureName = featureName;
          result.outcome = collectionResult.getOutcome();
          result.weight = collectionResult.getWeight();
          newList.add(result);
          newCrossProduct.add(newList);
        }
      }
      crossProduct = newCrossProduct;
    }
    // the cross-product
    for (List<CollectionFeatureResult> oneCollectionResultSet : crossProduct) {
      String collectionResult = "";
      double weight = 1.0;
      for (CollectionFeatureResult result : oneCollectionResultSet) {
        // Expose each collection outcome to the wrapped feature via the
        // runtime environment, keyed by the collection feature's name;
        // combination weights are multiplied together.
        env.setValue(result.featureName, result.outcome);
        collectionResult += result.outcome + "|";
        weight *= result.weight;
      }
      // Evaluate the wrapped feature once per combination, with the
      // environment set up above.
      FeatureResult<?> featureResult = wrappedFeature.check(context, env);
      if (featureResult != null) {
        if (wrappedFeature.getFeatureType().equals(StringFeature.class)) {
          // String result: the wrapped feature's own outcome is used as-is.
          String outcome = (String) featureResult.getOutcome();
          finalList.add(new WeightedOutcome<String>(outcome, weight));
        } else if (wrappedFeature.getFeatureType().equals(BooleanFeature.class)) {
          // Boolean result: appended to the "a|b|" combination key.
          Boolean outcome = (Boolean) featureResult.getOutcome();
          finalList.add(new WeightedOutcome<String>(collectionResult + outcome.toString(), weight));
        } else if (wrappedFeature.getFeatureType().equals(DoubleFeature.class)) {
          // Numeric results scale the combination's weight instead of
          // contributing to the outcome string.
          Double outcome = (Double) featureResult.getOutcome();
          finalList.add(new WeightedOutcome<String>(collectionResult, weight * outcome.doubleValue()));
        } else if (wrappedFeature.getFeatureType().equals(IntegerFeature.class)) {
          Integer outcome = (Integer) featureResult.getOutcome();
          finalList.add(new WeightedOutcome<String>(collectionResult, weight * outcome.doubleValue()));
        } else {
          throw new JolicielException("Cannot include collections in a top-level feature of type: " + wrappedFeature.getFeatureType().getSimpleName());
        }
      }
    }
    // Only produce a result if at least one combination yielded an outcome.
    if (finalList.size() > 0)
      finalResult = this.generateResult(finalList);
  }
  return finalResult;
}
Use of com.joliciel.talismane.utils.WeightedOutcome in the project talismane by joliciel-informatique.
From the class SpmrlConverter, method main.
/**
 * Converts SPMRL-format CoNLL files into the project's output format,
 * grouping compound (MWE) members into token clusters and then either
 * compressing each cluster into a single token, splitting it back into
 * plain tokens, or (with convertCompounds) rewriting known compound
 * pos-tag patterns as ordinary dependencies. Also reports compound
 * pattern counts and non-projective arcs.
 */
public static void main(String[] args) throws Exception {
  Map<String, String> argMap = StringUtils.convertArgs(args);
  String logConfigPath = argMap.get("logConfigFile");
  argMap.remove("logConfigFile");
  if (logConfigPath != null)
    LogUtils.configureLogging(logConfigPath);
  // Command-line option defaults.
  String spmrlPath = "";
  String suffix = "tal";
  boolean compressCompounds = true;
  boolean convertCompounds = false;
  String inDirPath = null;
  String outDirPath = null;
  String inSuffix = ".conll";
  String posTagSetPath = null;
  for (String argName : argMap.keySet()) {
    String argValue = argMap.get(argName);
    if (argName.equals("inFile")) {
      spmrlPath = argValue;
    } else if (argName.equals("inDir")) {
      inDirPath = argValue;
    } else if (argName.equals("outDir")) {
      outDirPath = argValue;
    } else if (argName.equals("inSuffix")) {
      inSuffix = argValue;
    } else if (argName.equals("suffix")) {
      suffix = argValue;
    } else if (argName.equals("compressCompounds")) {
      compressCompounds = argValue.equalsIgnoreCase("true");
    } else if (argName.equals("convertCompounds")) {
      convertCompounds = argValue.equalsIgnoreCase("true");
    } else if (argName.equals("posTagSet")) {
      posTagSetPath = argValue;
    } else {
      throw new RuntimeException("Unknown option: " + argName);
    }
  }
  if (!inSuffix.startsWith("."))
    inSuffix = "." + inSuffix;
  final String inSuffixFinal = inSuffix;
  // Collect input files: either every matching file in inDir, or the single inFile.
  List<File> inFiles = new ArrayList<File>();
  if (inDirPath != null) {
    File inDir = new File(inDirPath);
    File[] inFileArray = inDir.listFiles(new FilenameFilter() {

      @Override
      public boolean accept(File dir, String name) {
        return name.endsWith(inSuffixFinal);
      }
    });
    for (File file : inFileArray) inFiles.add(file);
  } else {
    File spmrlFile = new File(spmrlPath);
    inFiles.add(spmrlFile);
  }
  // Optional pos-tag set, stored in the static posTagSet field.
  if (posTagSetPath != null) {
    File posTagSetFile = new File(posTagSetPath);
    try (Scanner posTagSetScanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(posTagSetFile), "UTF-8")))) {
      posTagSet = new PosTagSet(posTagSetScanner);
    }
  }
  for (File inFile : inFiles) {
    try {
      // Output goes next to the input unless outDir was given.
      File outDir = inFile.getParentFile();
      if (outDirPath != null) {
        outDir = new File(outDirPath);
        outDir.mkdirs();
      }
      // Output file name: input name with inSuffix replaced by "." + suffix.
      String fileName = inFile.getName().substring(0, inFile.getName().length() - inSuffix.length()) + "." + suffix;
      Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outDir, fileName)), "UTF-8"));
      Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(inFile), "UTF-8")));
      // Buffer of the current sentence's lines, flushed at each blank line.
      List<ConllLine> lines = new ArrayList<SpmrlConverter.ConllLine>();
      int lineNumber = 0;
      int newLineNumber = 0;
      // Counts per compound pos-tag pattern (e.g. "NC|ADJ|"), for the report.
      Map<String, Integer> compoundPatternCounts = new TreeMap<String, Integer>();
      int nonProjectiveCount = 0;
      boolean errorOnNonProjective = false;
      while (scanner.hasNextLine()) {
        String line = scanner.nextLine();
        lineNumber++;
        LOG.trace(lineNumber + ": " + line);
        if (line.trim().length() == 0) {
          // Blank line = sentence boundary: process the buffered lines.
          // Step 1: group lines into token clusters, attaching compound
          // members (label "dep_cpd") to the cluster of their head.
          List<TokenCluster> tokens = new ArrayList<TokenCluster>();
          boolean inCluster = false;
          ConllLine lastLine = null;
          for (ConllLine conllLine : lines) {
            TokenCluster tokenCluster = new TokenCluster();
            if (!conllLine.label.equals("dep_cpd")) {
              if (inCluster) {
                // Head of a forward-looking cluster started earlier.
                tokenCluster = tokens.get(tokens.size() - 1);
                inCluster = false;
              } else {
                tokens.add(tokenCluster);
              }
            } else {
              if (conllLine.index < conllLine.governor) {
                // forward looking cluster
                if (lastLine != null && lastLine.compPosTag != null) {
                  tokenCluster = tokens.get(tokens.size() - 1);
                } else if (inCluster) {
                  tokenCluster = tokens.get(tokens.size() - 1);
                } else {
                  inCluster = true;
                  tokens.add(tokenCluster);
                }
              } else if (tokens.size() > 0) {
                // Backward-looking member: join the previous cluster.
                tokenCluster = tokens.get(tokens.size() - 1);
              } else {
                tokens.add(tokenCluster);
              }
            }
            tokenCluster.add(conllLine);
            lastLine = conllLine;
          }
          // Step 2: decide the fate of each multi-token cluster.
          List<TokenCluster> newTokens = new ArrayList<TokenCluster>();
          for (TokenCluster tokenCluster : tokens) {
            if (tokenCluster.size() > 1) {
              boolean split = false;
              String posTags = "";
              String word = "";
              for (ConllLine conllLine : tokenCluster) {
                posTags += conllLine.posTag2 + "|";
                word += conllLine.word + " ";
              }
              // Count this compound pattern for the end-of-file report.
              Integer countObj = compoundPatternCounts.get(posTags);
              int count = countObj == null ? 0 : countObj.intValue();
              count++;
              compoundPatternCounts.put(posTags, count);
              if (convertCompounds) {
                // Rewrite recognised compound patterns as ordinary syntactic
                // dependencies between the members; head = cluster index of
                // the member that keeps the cluster's external attachment.
                split = true;
                if (posTags.equals("NC|ADJ|")) {
                  tokenCluster.head = 0;
                  tokenCluster.get(1).governor = tokenCluster.get(0).index;
                  tokenCluster.get(1).label = "mod";
                  tokenCluster.get(1).copyGovernor();
                } else if (posTags.equals("NC|NC|")) {
                  tokenCluster.head = 0;
                  tokenCluster.get(1).governor = tokenCluster.get(0).index;
                  tokenCluster.get(1).label = "mod";
                  tokenCluster.get(1).copyGovernor();
                } else if (posTags.equals("NC|ADJ|ADJ|")) {
                  tokenCluster.head = 0;
                  tokenCluster.get(1).governor = tokenCluster.get(0).index;
                  tokenCluster.get(1).label = "mod";
                  tokenCluster.get(1).copyGovernor();
                  tokenCluster.get(2).governor = tokenCluster.get(0).index;
                  tokenCluster.get(2).label = "mod";
                  tokenCluster.get(2).copyGovernor();
                } else if (posTags.equals("NC|P|NC|") || posTags.equals("NC|P+D|NC|") || posTags.equals("NC|P|NPP|") || posTags.equals("NC|P+D|NPP|")) {
                  tokenCluster.head = 0;
                  tokenCluster.get(1).governor = tokenCluster.get(0).index;
                  tokenCluster.get(1).label = "dep";
                  tokenCluster.get(1).copyGovernor();
                  tokenCluster.get(2).governor = tokenCluster.get(1).index;
                  tokenCluster.get(2).label = "prep";
                  tokenCluster.get(2).copyGovernor();
                } else if (posTags.equals("NC|P|DET|NC|")) {
                  tokenCluster.head = 0;
                  tokenCluster.get(1).governor = tokenCluster.get(0).index;
                  tokenCluster.get(1).label = "dep";
                  tokenCluster.get(1).copyGovernor();
                  tokenCluster.get(2).governor = tokenCluster.get(3).index;
                  tokenCluster.get(2).label = "det";
                  tokenCluster.get(2).copyGovernor();
                  tokenCluster.get(3).governor = tokenCluster.get(1).index;
                  tokenCluster.get(3).label = "prep";
                  tokenCluster.get(3).copyGovernor();
                } else if (posTags.equals("NC|P|NC|ADJ|") || posTags.equals("NC|P+D|NC|ADJ|")) {
                  tokenCluster.head = 0;
                  tokenCluster.get(1).governor = tokenCluster.get(0).index;
                  tokenCluster.get(1).label = "dep";
                  tokenCluster.get(1).copyGovernor();
                  tokenCluster.get(2).governor = tokenCluster.get(1).index;
                  tokenCluster.get(2).label = "prep";
                  tokenCluster.get(2).copyGovernor();
                  tokenCluster.get(3).governor = tokenCluster.get(2).index;
                  tokenCluster.get(3).label = "mod";
                  tokenCluster.get(3).copyGovernor();
                } else if (posTags.equals("NC|ADJ|P|NC|") || posTags.equals("NC|ADJ|P+D|NC|")) {
                  tokenCluster.head = 0;
                  tokenCluster.get(1).governor = tokenCluster.get(0).index;
                  tokenCluster.get(1).label = "mod";
                  tokenCluster.get(1).copyGovernor();
                  tokenCluster.get(2).governor = tokenCluster.get(0).index;
                  tokenCluster.get(2).label = "dep";
                  tokenCluster.get(2).copyGovernor();
                  tokenCluster.get(3).governor = tokenCluster.get(2).index;
                  tokenCluster.get(3).label = "prep";
                  tokenCluster.get(3).copyGovernor();
                } else if (posTags.equals("ADJ|NC|")) {
                  // Noun (second member) becomes the head; it inherits the
                  // cluster's external attachment if it pointed at the ADJ.
                  tokenCluster.head = 1;
                  if (tokenCluster.get(1).governor == tokenCluster.get(0).index) {
                    tokenCluster.get(1).governor = tokenCluster.get(0).governor;
                    tokenCluster.get(1).label = tokenCluster.get(0).label;
                    tokenCluster.get(1).projGov = tokenCluster.get(0).projGov;
                    tokenCluster.get(1).projLabel = tokenCluster.get(0).projLabel;
                  }
                  tokenCluster.get(0).governor = tokenCluster.get(1).index;
                  tokenCluster.get(0).label = "mod";
                  tokenCluster.get(0).copyGovernor();
                } else {
                  // Unrecognised pattern: leave as a compound (split = false);
                  // log it unless it is a known harmless pattern.
                  if (posTags.equals("DET|PONCT|DET|") || posTags.equals("DET|DET|")) {
                    // do nothing
                  } else {
                    LOG.debug(posTags + ": " + word);
                  }
                  split = false;
                }
                if (split) {
                  for (ConllLine conllLine : tokenCluster) {
                    conllLine.removeMweHead();
                  }
                }
              }
              // Without compression, every cluster is split into plain tokens.
              if (!compressCompounds)
                split = true;
              if (split) {
                // Re-point dependents of the cluster's first token at the
                // member chosen as head, if it is not the first one.
                if (tokenCluster.head != 0) {
                  int oldIndex = tokenCluster.get(0).index;
                  int newIndex = tokenCluster.get(tokenCluster.head).index;
                  for (ConllLine conllLine : lines) {
                    if (conllLine.governor == oldIndex) {
                      conllLine.governor = newIndex;
                    }
                    if (conllLine.projGov == oldIndex) {
                      conllLine.projGov = newIndex;
                    }
                  }
                }
                // Each member becomes its own single-token cluster.
                for (ConllLine conllLine : tokenCluster) {
                  TokenCluster newCluster = new TokenCluster();
                  newCluster.add(conllLine);
                  newTokens.add(newCluster);
                }
              } else {
                // Compress: the cluster's first line takes on the compound
                // pos-tag and the dependency of the non-"dep_cpd" member.
                String compPosTag = null;
                for (ConllLine conllLine : tokenCluster) {
                  if (conllLine.compPosTag != null) {
                    compPosTag = conllLine.compPosTag;
                    break;
                  }
                }
                if (compPosTag == null) {
                  throw new RuntimeException("Didn't find compPosTag on line: " + tokenCluster.get(0).lineNumber);
                }
                ConllLine head = null;
                for (ConllLine conllLine : tokenCluster) {
                  if (!conllLine.label.equals("dep_cpd")) {
                    head = conllLine;
                    break;
                  }
                }
                if (head == null) {
                  throw new RuntimeException("Didn't find head on line: " + tokenCluster.get(0).lineNumber);
                }
                tokenCluster.get(0).posTag2 = compPosTag;
                tokenCluster.get(0).posTag = compPosTag;
                tokenCluster.get(0).governor = head.governor;
                tokenCluster.get(0).label = head.label;
                tokenCluster.get(0).projGov = head.projGov;
                tokenCluster.get(0).projLabel = head.projLabel;
                tokenCluster.get(0).removeMweHead();
                // Map the fine-grained compound pos-tag onto a coarse tag.
                if (compPosTag.equals("NC") || compPosTag.equals("NPP")) {
                  tokenCluster.get(0).posTag = "N";
                } else if (compPosTag.startsWith("V")) {
                  tokenCluster.get(0).posTag = "V";
                } else if (compPosTag.startsWith("PRO")) {
                  tokenCluster.get(0).posTag = "PRO";
                } else if (compPosTag.startsWith("ADJ")) {
                  tokenCluster.get(0).posTag = "A";
                } else if (compPosTag.startsWith("DET")) {
                  tokenCluster.get(0).posTag = "D";
                } else if (compPosTag.startsWith("CL")) {
                  tokenCluster.get(0).posTag = "CL";
                } else if (compPosTag.startsWith("C")) {
                  tokenCluster.get(0).posTag = "C";
                }
                newTokens.add(tokenCluster);
              }
            } else {
              // Single-token cluster: keep as-is.
              newTokens.add(tokenCluster);
            }
            // multi-token cluster?
          }
          tokens = newTokens;
          // Step 3: assign new consecutive indices and build the compound
          // surface form ("_"-joined words, with punctuation glued on).
          int currentIndex = 1;
          Map<Integer, Integer> indexMap = new HashMap<Integer, Integer>();
          indexMap.put(0, 0);
          for (TokenCluster tokenCluster : tokens) {
            tokenCluster.newIndex = currentIndex++;
            for (ConllLine conllLine : tokenCluster) {
              indexMap.put(conllLine.index, tokenCluster.newIndex);
            }
            tokenCluster.word = tokenCluster.get(0).word;
            tokenCluster.lemma = tokenCluster.get(0).lemma;
            for (int i = 1; i < tokenCluster.size(); i++) {
              ConllLine conllLine = tokenCluster.get(i);
              if (tokenCluster.word.length() == 0 || tokenCluster.word.endsWith("'") || tokenCluster.word.endsWith("-") || tokenCluster.word.endsWith(",") || conllLine.word.startsWith("-") || conllLine.word.equals(",")) {
                tokenCluster.word += conllLine.word;
              } else {
                tokenCluster.word += "_" + conllLine.word;
              }
            }
            if (tokenCluster.size() > 1) {
              // Compound lemma = compound word, but keep the first member's
              // original capitalisation for the initial letter.
              tokenCluster.lemma = tokenCluster.word;
              if (Character.isUpperCase(tokenCluster.lemma.charAt(0))) {
                if (!Character.isUpperCase(tokenCluster.get(0).lemma.charAt(0))) {
                  tokenCluster.lemma = tokenCluster.get(0).lemma.charAt(0) + tokenCluster.lemma.substring(1);
                }
              }
            }
          }
          // Step 4: emit one CoNLL line per cluster, remapping governors
          // through indexMap.
          List<ConllLine> newLines = new ArrayList<SpmrlConverter.ConllLine>();
          for (TokenCluster tokenCluster : tokens) {
            ConllLine conllLine = tokenCluster.get(0);
            if (conllLine.posTag2 == null || conllLine.posTag2.equals("null") || conllLine.posTag2.equals("UNK")) {
              throw new RuntimeException("Bad postag on line: " + lineNumber + ": " + conllLine);
            }
            newLineNumber++;
            String newLine = tokenCluster.newIndex + "\t" + tokenCluster.word + "\t" + tokenCluster.lemma + "\t" + conllLine.posTag + "\t" + conllLine.posTag2 + "\t" + conllLine.morph + "\t" + indexMap.get(conllLine.governor) + "\t" + conllLine.label + "\t" + indexMap.get(conllLine.projGov) + "\t" + conllLine.projLabel;
            ConllLine newConllLine = new ConllLine(newLine, lineNumber, newLineNumber);
            newLines.add(newConllLine);
          }
          // Step 5: retag conjunction "car" from CC to CS and rewrite its
          // coordination labels (and its dependents') accordingly.
          for (ConllLine conllLine : newLines) {
            if (conllLine.word.toLowerCase().equals("car") && conllLine.posTag2.equals("CC")) {
              conllLine.posTag2 = "CS";
              conllLine.morph = "s=s";
              if (conllLine.label.equals("coord")) {
                conllLine.label = "mod";
              }
              if (conllLine.projLabel.equals("coord")) {
                conllLine.projLabel = "mod";
              }
              for (ConllLine otherLine : newLines) {
                if (otherLine.governor == conllLine.index && otherLine.label.equals("dep_coord")) {
                  otherLine.label = "sub";
                }
                if (otherLine.projGov == conllLine.index && otherLine.projLabel.equals("dep_coord")) {
                  otherLine.projLabel = "sub";
                }
              }
            }
          }
          // Step 6: detect crossing arcs in the projective governors by
          // comparing every pair of arcs' spans.
          int i = 0;
          boolean hasNonProjective = false;
          for (ConllLine conllLine : newLines) {
            i++;
            int headIndex = conllLine.projGov;
            int depIndex = conllLine.index;
            int startIndex = headIndex < depIndex ? headIndex : depIndex;
            int endIndex = headIndex >= depIndex ? headIndex : depIndex;
            int j = 0;
            for (ConllLine otherLine : newLines) {
              j++;
              if (j <= i)
                continue;
              int headIndex2 = otherLine.projGov;
              int depIndex2 = otherLine.index;
              int startIndex2 = headIndex2 < depIndex2 ? headIndex2 : depIndex2;
              int endIndex2 = headIndex2 >= depIndex2 ? headIndex2 : depIndex2;
              boolean nonProjective = false;
              // Arcs cross if exactly one endpoint of one span lies strictly
              // inside the other span.
              if (startIndex2 < startIndex && endIndex2 > startIndex && endIndex2 < endIndex) {
                nonProjective = true;
              } else if (startIndex2 > startIndex && startIndex2 < endIndex && endIndex2 > endIndex) {
                nonProjective = true;
              }
              if (nonProjective) {
                LOG.error("Non-projective arcs at line: " + lineNumber);
                LOG.error(conllLine.lineNumber + ": " + conllLine.toString());
                LOG.error(otherLine.lineNumber + ": " + otherLine.toString());
                hasNonProjective = true;
                nonProjectiveCount++;
              }
            }
          }
          // Step 7: write the sentence followed by a blank line.
          for (ConllLine conllLine : newLines) {
            writer.write(conllLine.toString() + "\n");
          }
          // Account for the blank separator line in the output numbering.
          newLineNumber++;
          writer.write("\n");
          writer.flush();
          if (errorOnNonProjective && hasNonProjective)
            throw new RuntimeException("Found non projective arc");
          lines = new ArrayList<SpmrlConverter.ConllLine>();
        } else {
          // Non-blank line: parse and buffer it for the current sentence.
          ConllLine conllLine = new ConllLine(line, lineNumber, lineNumber);
          lines.add(conllLine);
        }
      }
      scanner.close();
      writer.close();
      // Report compound pattern counts, ordered via WeightedOutcome's
      // natural ordering in a TreeSet.
      Set<WeightedOutcome<String>> counts = new TreeSet<WeightedOutcome<String>>();
      for (String posTags : compoundPatternCounts.keySet()) {
        counts.add(new WeightedOutcome<String>(posTags, compoundPatternCounts.get(posTags)));
      }
      for (WeightedOutcome<String> count : counts) {
        LOG.info(count.getOutcome() + ": " + count.getWeight());
      }
      LOG.info("non projective count: " + nonProjectiveCount);
    } catch (Exception e) {
      // Best-effort per file: log the error and continue with the next file.
      LogUtils.logError(LOG, e);
    }
  }
}
Aggregations