Use of com.joliciel.talismane.posTagger.PosTagSet in project talismane by joliciel-informatique.
The class PosTagSetFeature, method checkInternal.
@Override
public FeatureResult<List<WeightedOutcome<String>>> checkInternal(TokenWrapper context, RuntimeEnvironment env) {
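// Returns every tag in the session's tag set, each with a uniform weight of 1.0.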
PosTagSet posTagSet = TalismaneSession.get(sessionId).getPosTagSet();
Set<PosTag> posTags = posTagSet.getTags();
List<WeightedOutcome<String>> resultList = new ArrayList<WeightedOutcome<String>>();
for (PosTag posTag : posTags) {
resultList.add(new WeightedOutcome<String>(posTag.getCode(), 1.0));
}
return this.generateResult(resultList);
}
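For context, a minimal sketch (not from the project) of how a caller might consume the uniform outcome list built above; it relies only on the getOutcome() and getWeight() accessors that also appear in the SpmrlConverter example further down, and the printTagWeights helper name is purely illustrative.
// Hypothetical helper: print each tag code with its weight.
static void printTagWeights(List<WeightedOutcome<String>> outcomes) {
    for (WeightedOutcome<String> outcome : outcomes) {
        System.out.println(outcome.getOutcome() + "\t" + outcome.getWeight());
    }
}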
Use of com.joliciel.talismane.posTagger.PosTagSet in project talismane by joliciel-informatique.
The class PosTaggerStatisticsWriter, method onCompleteAnalysis.
@Override
public void onCompleteAnalysis() throws IOException {
if (writer != null) {
PosTagSet posTagSet = TalismaneSession.get(sessionId).getPosTagSet();
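// Make sure every tag in the tag set gets a row in the output, even if it never occurred.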
for (PosTag posTag : posTagSet.getTags()) {
if (!stats.posTagCounts.containsKey(posTag.getCode())) {
stats.posTagCounts.put(posTag.getCode(), 0);
}
}
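// Share of distinct words in this corpus that are absent from the reference corpus (reported as 100% when there is no reference).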
double unknownLexiconPercent = 1;
if (referenceStats != null) {
int unknownLexiconCount = 0;
for (String word : stats.words) {
if (!referenceStats.words.contains(word))
unknownLexiconCount++;
}
unknownLexiconPercent = (double) unknownLexiconCount / (double) stats.words.size();
}
double unknownLowercaseLexiconPercent = 1;
if (referenceStats != null) {
int unknownLowercaseLexiconCount = 0;
for (String lowercase : stats.lowerCaseWords) {
if (!referenceStats.lowerCaseWords.contains(lowercase))
unknownLowercaseLexiconCount++;
}
unknownLowercaseLexiconPercent = (double) unknownLowercaseLexiconCount / (double) stats.lowerCaseWords.size();
}
writer.write(CSV.format("sentenceCount") + CSV.format(stats.sentenceCount) + "\n");
writer.write(CSV.format("sentenceLengthMean") + CSV.format(stats.sentenceLengthStats.getMean()) + "\n");
writer.write(CSV.format("sentenceLengthStdDev") + CSV.format(stats.sentenceLengthStats.getStandardDeviation()) + "\n");
writer.write(CSV.format("lexiconSize") + CSV.format(stats.words.size()) + "\n");
writer.write(CSV.format("lexiconUnknownInRefCorpus") + CSV.format(unknownLexiconPercent * 100.0) + "\n");
writer.write(CSV.format("tokenCount") + CSV.format(stats.tokenCount) + "\n");
double unknownTokenPercent = ((double) stats.unknownTokenCount / (double) stats.tokenCount) * 100.0;
writer.write(CSV.format("tokenUnknownInRefCorpus") + CSV.format(unknownTokenPercent) + "\n");
double unknownInLexiconPercent = ((double) stats.unknownInLexiconCount / (double) stats.tokenCount) * 100.0;
writer.write(CSV.format("tokenUnknownInRefLexicon") + CSV.format(unknownInLexiconPercent) + "\n");
writer.write(CSV.format("lowercaseLexiconSize") + CSV.format(stats.lowerCaseWords.size()) + "\n");
writer.write(CSV.format("lowercaseLexiconUnknownInRefCorpus") + CSV.format(unknownLowercaseLexiconPercent * 100.0) + "\n");
writer.write(CSV.format("alphanumericCount") + CSV.format(stats.alphanumericCount) + "\n");
double unknownAlphanumericPercent = ((double) stats.unknownAlphanumericCount / (double) stats.alphanumericCount) * 100.0;
writer.write(CSV.format("alphaUnknownInRefCorpus") + CSV.format(unknownAlphanumericPercent) + "\n");
double unknownAlphaInLexiconPercent = ((double) stats.unknownAlphaInLexiconCount / (double) stats.alphanumericCount) * 100.0;
writer.write(CSV.format("alphaUnknownInRefLexicon") + CSV.format(unknownAlphaInLexiconPercent) + "\n");
writer.write(CSV.format("openClassCount") + CSV.format(stats.openClassCount) + "\n");
double openClassUnknownPercent = ((double) stats.openClassUnknownInRefCorpus / (double) stats.openClassCount) * 100.0;
writer.write(CSV.format("openClassUnknownInRefCorpus") + CSV.format(openClassUnknownPercent) + "\n");
double openClassUnknownInLexiconPercent = ((double) stats.openClassUnknownInLexicon / (double) stats.openClassCount) * 100.0;
writer.write(CSV.format("openClassUnknownInRefLexicon") + CSV.format(openClassUnknownInLexiconPercent) + "\n");
writer.write(CSV.format("closedClassCount") + CSV.format(stats.closedClassCount) + "\n");
double closedClassUnknownPercent = ((double) stats.closedClassUnknownInRefCorpus / (double) stats.closedClassCount) * 100.0;
writer.write(CSV.format("closedClassUnknownInRefCorpus") + CSV.format(closedClassUnknownPercent) + "\n");
double closedClassUnknownInLexiconPercent = ((double) stats.closedClassUnknownInLexicon / (double) stats.closedClassCount) * 100.0;
writer.write(CSV.format("closedClassUnknownInRefLexicon") + CSV.format(closedClassUnknownInLexiconPercent) + "\n");
for (String posTag : stats.posTagCounts.keySet()) {
int count = stats.posTagCounts.get(posTag);
writer.write(CSV.format(posTag) + CSV.format(count) + CSV.format(((double) count / (double) stats.tokenCount) * 100.0) + "\n");
}
writer.flush();
writer.close();
}
if (this.serializationFile != null) {
ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(serializationFile, false));
zos.putNextEntry(new ZipEntry("Contents.obj"));
ObjectOutputStream oos = new ObjectOutputStream(zos);
try {
oos.writeObject(stats);
} finally {
oos.flush();
}
zos.flush();
zos.close();
}
}
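As a complement, a minimal read-back sketch (not part of Talismane) for the statistics serialized above; it only assumes the "Contents.obj" entry name used by the writer, and since the concrete statistics class is not shown in this snippet, the result is returned as a plain Object.
// Hypothetical helper using only java.io and java.util.zip.
static Object readSerializedStats(File serializationFile) throws IOException, ClassNotFoundException {
    try (ZipInputStream zis = new ZipInputStream(new FileInputStream(serializationFile))) {
        zis.getNextEntry(); // position on the single "Contents.obj" entry
        ObjectInputStream ois = new ObjectInputStream(zis);
        return ois.readObject();
    }
}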
Use of com.joliciel.talismane.posTagger.PosTagSet in project talismane by joliciel-informatique.
The class SpmrlConverter, method main.
public static void main(String[] args) throws Exception {
Map<String, String> argMap = StringUtils.convertArgs(args);
String logConfigPath = argMap.get("logConfigFile");
argMap.remove("logConfigFile");
if (logConfigPath != null)
LogUtils.configureLogging(logConfigPath);
String spmrlPath = "";
String suffix = "tal";
boolean compressCompounds = true;
boolean convertCompounds = false;
String inDirPath = null;
String outDirPath = null;
String inSuffix = ".conll";
String posTagSetPath = null;
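// Recognised options: inFile or inDir select the input, inSuffix filters files in inDir, outDir and suffix control output naming, compressCompounds/convertCompounds govern compound handling, posTagSet points at a tag set descriptor.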
for (String argName : argMap.keySet()) {
String argValue = argMap.get(argName);
if (argName.equals("inFile")) {
spmrlPath = argValue;
} else if (argName.equals("inDir")) {
inDirPath = argValue;
} else if (argName.equals("outDir")) {
outDirPath = argValue;
} else if (argName.equals("inSuffix")) {
inSuffix = argValue;
} else if (argName.equals("suffix")) {
suffix = argValue;
} else if (argName.equals("compressCompounds")) {
compressCompounds = argValue.equalsIgnoreCase("true");
} else if (argName.equals("convertCompounds")) {
convertCompounds = argValue.equalsIgnoreCase("true");
} else if (argName.equals("posTagSet")) {
posTagSetPath = argValue;
} else {
throw new RuntimeException("Unknown option: " + argName);
}
}
if (!inSuffix.startsWith("."))
inSuffix = "." + inSuffix;
final String inSuffixFinal = inSuffix;
List<File> inFiles = new ArrayList<File>();
if (inDirPath != null) {
File inDir = new File(inDirPath);
File[] inFileArray = inDir.listFiles(new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.endsWith(inSuffixFinal);
}
});
for (File file : inFileArray) inFiles.add(file);
} else {
File spmrlFile = new File(spmrlPath);
inFiles.add(spmrlFile);
}
if (posTagSetPath != null) {
File posTagSetFile = new File(posTagSetPath);
try (Scanner posTagSetScanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(posTagSetFile), "UTF-8")))) {
posTagSet = new PosTagSet(posTagSetScanner);
}
}
for (File inFile : inFiles) {
try {
File outDir = inFile.getParentFile();
if (outDirPath != null) {
outDir = new File(outDirPath);
outDir.mkdirs();
}
String fileName = inFile.getName().substring(0, inFile.getName().length() - inSuffix.length()) + "." + suffix;
Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outDir, fileName)), "UTF-8"));
Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(inFile), "UTF-8")));
List<ConllLine> lines = new ArrayList<SpmrlConverter.ConllLine>();
int lineNumber = 0;
int newLineNumber = 0;
Map<String, Integer> compoundPatternCounts = new TreeMap<String, Integer>();
int nonProjectiveCount = 0;
boolean errorOnNonProjective = false;
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
lineNumber++;
LOG.trace(lineNumber + ": " + line);
if (line.trim().length() == 0) {
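// A blank line marks the end of a sentence: process the buffered lines as one sentence.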
List<TokenCluster> tokens = new ArrayList<TokenCluster>();
boolean inCluster = false;
ConllLine lastLine = null;
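// Group compound parts (label dep_cpd) into token clusters; other tokens get a cluster of their own.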
for (ConllLine conllLine : lines) {
TokenCluster tokenCluster = new TokenCluster();
if (!conllLine.label.equals("dep_cpd")) {
if (inCluster) {
tokenCluster = tokens.get(tokens.size() - 1);
inCluster = false;
} else {
tokens.add(tokenCluster);
}
} else {
if (conllLine.index < conllLine.governor) {
// forward looking cluster
if (lastLine != null && lastLine.compPosTag != null) {
tokenCluster = tokens.get(tokens.size() - 1);
} else if (inCluster) {
tokenCluster = tokens.get(tokens.size() - 1);
} else {
inCluster = true;
tokens.add(tokenCluster);
}
} else if (tokens.size() > 0) {
tokenCluster = tokens.get(tokens.size() - 1);
} else {
tokens.add(tokenCluster);
}
}
tokenCluster.add(conllLine);
lastLine = conllLine;
}
List<TokenCluster> newTokens = new ArrayList<TokenCluster>();
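// Decide, cluster by cluster, whether to split the compound back into ordinary tokens or compress it into a single token.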
for (TokenCluster tokenCluster : tokens) {
if (tokenCluster.size() > 1) {
boolean split = false;
String posTags = "";
String word = "";
for (ConllLine conllLine : tokenCluster) {
posTags += conllLine.posTag2 + "|";
word += conllLine.word + " ";
}
Integer countObj = compoundPatternCounts.get(posTags);
int count = countObj == null ? 0 : countObj.intValue();
count++;
compoundPatternCounts.put(posTags, count);
if (convertCompounds) {
split = true;
if (posTags.equals("NC|ADJ|")) {
tokenCluster.head = 0;
tokenCluster.get(1).governor = tokenCluster.get(0).index;
tokenCluster.get(1).label = "mod";
tokenCluster.get(1).copyGovernor();
} else if (posTags.equals("NC|NC|")) {
tokenCluster.head = 0;
tokenCluster.get(1).governor = tokenCluster.get(0).index;
tokenCluster.get(1).label = "mod";
tokenCluster.get(1).copyGovernor();
} else if (posTags.equals("NC|ADJ|ADJ|")) {
tokenCluster.head = 0;
tokenCluster.get(1).governor = tokenCluster.get(0).index;
tokenCluster.get(1).label = "mod";
tokenCluster.get(1).copyGovernor();
tokenCluster.get(2).governor = tokenCluster.get(0).index;
tokenCluster.get(2).label = "mod";
tokenCluster.get(2).copyGovernor();
} else if (posTags.equals("NC|P|NC|") || posTags.equals("NC|P+D|NC|") || posTags.equals("NC|P|NPP|") || posTags.equals("NC|P+D|NPP|")) {
tokenCluster.head = 0;
tokenCluster.get(1).governor = tokenCluster.get(0).index;
tokenCluster.get(1).label = "dep";
tokenCluster.get(1).copyGovernor();
tokenCluster.get(2).governor = tokenCluster.get(1).index;
tokenCluster.get(2).label = "prep";
tokenCluster.get(2).copyGovernor();
} else if (posTags.equals("NC|P|DET|NC|")) {
tokenCluster.head = 0;
tokenCluster.get(1).governor = tokenCluster.get(0).index;
tokenCluster.get(1).label = "dep";
tokenCluster.get(1).copyGovernor();
tokenCluster.get(2).governor = tokenCluster.get(3).index;
tokenCluster.get(2).label = "det";
tokenCluster.get(2).copyGovernor();
tokenCluster.get(3).governor = tokenCluster.get(1).index;
tokenCluster.get(3).label = "prep";
tokenCluster.get(3).copyGovernor();
} else if (posTags.equals("NC|P|NC|ADJ|") || posTags.equals("NC|P+D|NC|ADJ|")) {
tokenCluster.head = 0;
tokenCluster.get(1).governor = tokenCluster.get(0).index;
tokenCluster.get(1).label = "dep";
tokenCluster.get(1).copyGovernor();
tokenCluster.get(2).governor = tokenCluster.get(1).index;
tokenCluster.get(2).label = "prep";
tokenCluster.get(2).copyGovernor();
tokenCluster.get(3).governor = tokenCluster.get(2).index;
tokenCluster.get(3).label = "mod";
tokenCluster.get(3).copyGovernor();
} else if (posTags.equals("NC|ADJ|P|NC|") || posTags.equals("NC|ADJ|P+D|NC|")) {
tokenCluster.head = 0;
tokenCluster.get(1).governor = tokenCluster.get(0).index;
tokenCluster.get(1).label = "mod";
tokenCluster.get(1).copyGovernor();
tokenCluster.get(2).governor = tokenCluster.get(0).index;
tokenCluster.get(2).label = "dep";
tokenCluster.get(2).copyGovernor();
tokenCluster.get(3).governor = tokenCluster.get(2).index;
tokenCluster.get(3).label = "prep";
tokenCluster.get(3).copyGovernor();
} else if (posTags.equals("ADJ|NC|")) {
tokenCluster.head = 1;
if (tokenCluster.get(1).governor == tokenCluster.get(0).index) {
tokenCluster.get(1).governor = tokenCluster.get(0).governor;
tokenCluster.get(1).label = tokenCluster.get(0).label;
tokenCluster.get(1).projGov = tokenCluster.get(0).projGov;
tokenCluster.get(1).projLabel = tokenCluster.get(0).projLabel;
}
tokenCluster.get(0).governor = tokenCluster.get(1).index;
tokenCluster.get(0).label = "mod";
tokenCluster.get(0).copyGovernor();
} else {
if (posTags.equals("DET|PONCT|DET|") || posTags.equals("DET|DET|")) {
// do nothing
} else {
LOG.debug(posTags + ": " + word);
}
split = false;
}
if (split) {
for (ConllLine conllLine : tokenCluster) {
conllLine.removeMweHead();
}
}
}
if (!compressCompounds)
split = true;
if (split) {
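// Splitting: redirect any dependents of the cluster's first token to the new head, then emit each part as its own single-token cluster.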
if (tokenCluster.head != 0) {
int oldIndex = tokenCluster.get(0).index;
int newIndex = tokenCluster.get(tokenCluster.head).index;
for (ConllLine conllLine : lines) {
if (conllLine.governor == oldIndex) {
conllLine.governor = newIndex;
}
if (conllLine.projGov == oldIndex) {
conllLine.projGov = newIndex;
}
}
}
for (ConllLine conllLine : tokenCluster) {
TokenCluster newCluster = new TokenCluster();
newCluster.add(conllLine);
newTokens.add(newCluster);
}
} else {
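// Compressing: collapse the cluster into its first token, which takes the compound pos-tag and the attachment of the cluster head.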
String compPosTag = null;
for (ConllLine conllLine : tokenCluster) {
if (conllLine.compPosTag != null) {
compPosTag = conllLine.compPosTag;
break;
}
}
if (compPosTag == null) {
throw new RuntimeException("Didn't find compPosTag on line: " + tokenCluster.get(0).lineNumber);
}
ConllLine head = null;
for (ConllLine conllLine : tokenCluster) {
if (!conllLine.label.equals("dep_cpd")) {
head = conllLine;
break;
}
}
if (head == null) {
throw new RuntimeException("Didn't find head on line: " + tokenCluster.get(0).lineNumber);
}
tokenCluster.get(0).posTag2 = compPosTag;
tokenCluster.get(0).posTag = compPosTag;
tokenCluster.get(0).governor = head.governor;
tokenCluster.get(0).label = head.label;
tokenCluster.get(0).projGov = head.projGov;
tokenCluster.get(0).projLabel = head.projLabel;
tokenCluster.get(0).removeMweHead();
if (compPosTag.equals("NC") || compPosTag.equals("NPP")) {
tokenCluster.get(0).posTag = "N";
} else if (compPosTag.startsWith("V")) {
tokenCluster.get(0).posTag = "V";
} else if (compPosTag.startsWith("PRO")) {
tokenCluster.get(0).posTag = "PRO";
} else if (compPosTag.startsWith("ADJ")) {
tokenCluster.get(0).posTag = "A";
} else if (compPosTag.startsWith("DET")) {
tokenCluster.get(0).posTag = "D";
} else if (compPosTag.startsWith("CL")) {
tokenCluster.get(0).posTag = "CL";
} else if (compPosTag.startsWith("C")) {
tokenCluster.get(0).posTag = "C";
}
newTokens.add(tokenCluster);
}
} else {
newTokens.add(tokenCluster);
}
// multi-token cluster?
}
tokens = newTokens;
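// Renumber the clusters consecutively, record the mapping from old token indices to new ones, and build the concatenated surface form of each compressed compound.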
int currentIndex = 1;
Map<Integer, Integer> indexMap = new HashMap<Integer, Integer>();
indexMap.put(0, 0);
for (TokenCluster tokenCluster : tokens) {
tokenCluster.newIndex = currentIndex++;
for (ConllLine conllLine : tokenCluster) {
indexMap.put(conllLine.index, tokenCluster.newIndex);
}
tokenCluster.word = tokenCluster.get(0).word;
tokenCluster.lemma = tokenCluster.get(0).lemma;
for (int i = 1; i < tokenCluster.size(); i++) {
ConllLine conllLine = tokenCluster.get(i);
if (tokenCluster.word.length() == 0 || tokenCluster.word.endsWith("'") || tokenCluster.word.endsWith("-") || tokenCluster.word.endsWith(",") || conllLine.word.startsWith("-") || conllLine.word.equals(",")) {
tokenCluster.word += conllLine.word;
} else {
tokenCluster.word += "_" + conllLine.word;
}
}
if (tokenCluster.size() > 1) {
tokenCluster.lemma = tokenCluster.word;
if (Character.isUpperCase(tokenCluster.lemma.charAt(0))) {
if (!Character.isUpperCase(tokenCluster.get(0).lemma.charAt(0))) {
tokenCluster.lemma = tokenCluster.get(0).lemma.charAt(0) + tokenCluster.lemma.substring(1);
}
}
}
}
List<ConllLine> newLines = new ArrayList<SpmrlConverter.ConllLine>();
for (TokenCluster tokenCluster : tokens) {
ConllLine conllLine = tokenCluster.get(0);
if (conllLine.posTag2 == null || conllLine.posTag2.equals("null") || conllLine.posTag2.equals("UNK")) {
throw new RuntimeException("Bad postag on line: " + lineNumber + ": " + conllLine);
}
newLineNumber++;
String newLine = tokenCluster.newIndex + "\t" + tokenCluster.word + "\t" + tokenCluster.lemma + "\t" + conllLine.posTag + "\t" + conllLine.posTag2 + "\t" + conllLine.morph + "\t" + indexMap.get(conllLine.governor) + "\t" + conllLine.label + "\t" + indexMap.get(conllLine.projGov) + "\t" + conllLine.projLabel;
ConllLine newConllLine = new ConllLine(newLine, lineNumber, newLineNumber);
newLines.add(newConllLine);
}
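// Special case: the conjunction "car" tagged CC is re-tagged as subordinating (CS), and its dependency labels (and those of its dependents) are adjusted to match.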
for (ConllLine conllLine : newLines) {
if (conllLine.word.toLowerCase().equals("car") && conllLine.posTag2.equals("CC")) {
conllLine.posTag2 = "CS";
conllLine.morph = "s=s";
if (conllLine.label.equals("coord")) {
conllLine.label = "mod";
}
if (conllLine.projLabel.equals("coord")) {
conllLine.projLabel = "mod";
}
for (ConllLine otherLine : newLines) {
if (otherLine.governor == conllLine.index && otherLine.label.equals("dep_coord")) {
otherLine.label = "sub";
}
if (otherLine.projGov == conllLine.index && otherLine.projLabel.equals("dep_coord")) {
otherLine.projLabel = "sub";
}
}
}
}
int i = 0;
boolean hasNonProjective = false;
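// Check every pair of projective arcs for crossing; crossings are logged and counted as non-projective.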
for (ConllLine conllLine : newLines) {
i++;
int headIndex = conllLine.projGov;
int depIndex = conllLine.index;
int startIndex = headIndex < depIndex ? headIndex : depIndex;
int endIndex = headIndex >= depIndex ? headIndex : depIndex;
int j = 0;
for (ConllLine otherLine : newLines) {
j++;
if (j <= i)
continue;
int headIndex2 = otherLine.projGov;
int depIndex2 = otherLine.index;
int startIndex2 = headIndex2 < depIndex2 ? headIndex2 : depIndex2;
int endIndex2 = headIndex2 >= depIndex2 ? headIndex2 : depIndex2;
boolean nonProjective = false;
if (startIndex2 < startIndex && endIndex2 > startIndex && endIndex2 < endIndex) {
nonProjective = true;
} else if (startIndex2 > startIndex && startIndex2 < endIndex && endIndex2 > endIndex) {
nonProjective = true;
}
if (nonProjective) {
LOG.error("Non-projective arcs at line: " + lineNumber);
LOG.error(conllLine.lineNumber + ": " + conllLine.toString());
LOG.error(otherLine.lineNumber + ": " + otherLine.toString());
hasNonProjective = true;
nonProjectiveCount++;
}
}
}
for (ConllLine conllLine : newLines) {
writer.write(conllLine.toString() + "\n");
}
newLineNumber++;
writer.write("\n");
writer.flush();
if (errorOnNonProjective && hasNonProjective)
throw new RuntimeException("Found non projective arc");
lines = new ArrayList<SpmrlConverter.ConllLine>();
} else {
ConllLine conllLine = new ConllLine(line, lineNumber, lineNumber);
lines.add(conllLine);
}
}
scanner.close();
writer.close();
Set<WeightedOutcome<String>> counts = new TreeSet<WeightedOutcome<String>>();
for (String posTags : compoundPatternCounts.keySet()) {
counts.add(new WeightedOutcome<String>(posTags, compoundPatternCounts.get(posTags)));
}
for (WeightedOutcome<String> count : counts) {
LOG.info(count.getOutcome() + ": " + count.getWeight());
}
LOG.info("non projective count: " + nonProjectiveCount);
} catch (Exception e) {
LogUtils.logError(LOG, e);
}
}
}
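The crossing test above can be restated as a standalone predicate; the following sketch is not part of the project (arcsCross and its parameter names are illustrative) and simply mirrors the two conditions used in the loop.
// Hypothetical helper: two arcs cross when exactly one endpoint of the second arc lies strictly inside the span of the first.
static boolean arcsCross(int head1, int dep1, int head2, int dep2) {
    int start1 = Math.min(head1, dep1), end1 = Math.max(head1, dep1);
    int start2 = Math.min(head2, dep2), end2 = Math.max(head2, dep2);
    return (start2 < start1 && end2 > start1 && end2 < end1)
        || (start2 > start1 && start2 < end1 && end2 > end1);
}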
Use of com.joliciel.talismane.posTagger.PosTagSet in project talismane by joliciel-informatique.
The class LexiconReaderTest, method testReadLexicons.
@Test
public void testReadLexicons() throws Exception {
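// Point the Typesafe Config loader at a test configuration that declares the lexicons, and force it to be re-read.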
System.setProperty("config.file", "src/test/resources/testWithLex.conf");
ConfigFactory.invalidateCaches();
final Config config = ConfigFactory.load();
final String sessionId = "test";
PosTaggerLexicon lexicon = TalismaneSession.get(sessionId).getMergedLexicon();
List<LexicalEntry> entries = lexicon.getEntries("dame");
for (LexicalEntry entry : entries) {
System.out.println(entry);
}
assertEquals(9, entries.size());
PosTagSet posTagSet = TalismaneSession.get(sessionId).getPosTagSet();
entries = lexicon.findLexicalEntries("dame", posTagSet.getPosTag("NC"));
for (LexicalEntry entry : entries) {
System.out.println(entry);
}
assertEquals(2, entries.size());
Set<PosTag> posTags = lexicon.findPossiblePosTags("dame");
System.out.println(posTags);
assertEquals(4, posTags.size());
System.clearProperty("config.file");
ConfigFactory.invalidateCaches();
}
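Building on the test above, a minimal sketch (not from the project) of checking whether a word admits a particular tag; the canBeNoun variable and the choice of "NC" are illustrative, and only calls already shown on this page are used.
// Does "dame" admit the tag NC according to the merged lexicon?
boolean canBeNoun = false;
for (PosTag posTag : lexicon.findPossiblePosTags("dame")) {
    if (posTag.getCode().equals("NC")) {
        canBeNoun = true;
        break;
    }
}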