Example usage of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class ProcessTwnertcData, method main.
/**
 * Converts the TWNERTC coarse-grained dump into a fastText-style text
 * classification corpus ("__label__<category> <text>") and collects per-category
 * counts into a Histogram, saving them sorted by frequency.
 *
 * NOTE(review): this snippet looks truncated — {@code nerLines} and
 * {@code nerOut} are declared but never written, and the {@code ranges} list
 * built below is discarded (only the "O" label is handled, and the last open
 * range is never flushed). Confirm against the full original source before
 * relying on the NER part.
 */
public static void main(String[] args) throws IOException {
// Input dump: tab-separated lines of <category> TAB <ner-labels> TAB <words>.
Path corpus = Paths.get("/media/ahmetaa/depo/ner/TWNERTC_All_Versions/TWNERTC_TC_Coarse_Grained_NER_DomainDependent_NoiseReduction.DUMP");
Path nerOut = Paths.get("/media/ahmetaa/depo/ner/ner-coarse");
Path categoryOut = Paths.get("/media/ahmetaa/depo/classification/twnertc-data");
// Stream the corpus in 10k-line chunks to bound memory for the read side.
BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
List<String> nerLines = new ArrayList<>();
List<String> categoryLines = new ArrayList<>();
Histogram<String> categories = new Histogram<>();
for (TextChunk chunk : loader) {
for (String line : chunk) {
List<String> parts = TextUtil.TAB_SPLITTER.splitToList(line);
// parts.get(0) = category label, parts.get(2) = raw sentence text.
categoryLines.add("__label__" + parts.get(0) + " " + parts.get(2));
categories.add(parts.get(0));
List<String> nerLabels = TextUtil.SPACE_SPLITTER.splitToList(parts.get(1));
List<String> nerWords = TextUtil.SPACE_SPLITTER.splitToList(parts.get(2));
// Skip malformed rows where label and token counts disagree.
if (nerLabels.size() != nerWords.size()) {
continue;
}
List<NerRange> ranges = new ArrayList<>();
NerRange range = new NerRange();
for (int i = 0; i < nerLabels.size(); i++) {
String lbl = nerLabels.get(i);
String word = nerWords.get(i);
// Only the outside-of-entity label "O" is handled here; non-"O" labels
// fall through without effect — presumably missing logic, see class note.
if (lbl.equals("O")) {
if (range.type == null) {
range.type = "O";
} else {
if (range.type.equals("O")) {
range.seq.add(word);
} else {
ranges.add(range);
range = new NerRange();
range.type = "O";
range.seq.add(word);
}
}
}
}
}
// Progress indicator: approximate number of lines processed so far.
Log.info(chunk.index * loader.getBlockSize());
}
Files.write(categoryOut, categoryLines);
// Persist "<category> <count>" pairs, most frequent first.
categories.saveSortedByCounts(Paths.get("/media/ahmetaa/depo/classification/categories"), " ");
}
Example usage of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class NormalizationScripts, method splitWords.
/**
 * Finds likely accidental word concatenations: for each frequent unknown word,
 * tries every head/tail split point and keeps the split with the highest bigram
 * language-model probability, writing "word = head tail" pairs to splitFile
 * (and the same plus the word's count to a sibling "...freq" file).
 *
 * NOTE(review): both the noisy and the clean vocabulary load the file named
 * "incorrect" — loading "incorrect" from cleanVocab looks suspicious; confirm
 * the intended file name against the vocabulary directory layout.
 */
static void splitWords(Path noisyVocab, Path cleanVocab, Path splitFile, Path lmPath, Path asciiMapPath, TurkishMorphology morphology, int minWordCount) throws IOException {
// Words that already have an ascii-map entry are handled elsewhere; skip them.
Set<String> asciiMapKeys = Files.readAllLines(asciiMapPath).stream().map(s -> s.substring(0, s.indexOf('='))).collect(Collectors.toSet());
// Natural-log base so scores below are ln-probabilities (threshold -7 ~ 1e-3).
SmoothLm lm = SmoothLm.builder(lmPath).logBase(Math.E).build();
Log.info("Language model = %s", lm.info());
Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
wordFreq.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
Log.info("%d words loaded.", wordFreq.size());
// Rare words are too noisy to split reliably.
wordFreq.removeSmaller(minWordCount);
if (minWordCount > 1) {
Log.info("%d words left after removing counts less than %d.", wordFreq.size(), minWordCount);
}
int unkIndex = lm.getVocabulary().getUnknownWordIndex();
try (PrintWriter pw = new PrintWriter(splitFile.toFile(), "utf-8");
PrintWriter pwFreq = new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
for (String word : wordFreq.getSortedList()) {
if (asciiMapKeys.contains(word)) {
continue;
}
// Too short to split meaningfully, or already hyphenated.
if (word.length() < 5 || word.contains("-")) {
continue;
}
List<ScoredItem<String>> k = new ArrayList<>();
// Try every interior split point; both halves must be at least 1 char.
for (int i = 1; i < word.length() - 1; i++) {
String head = word.substring(0, i);
String tail = word.substring(i);
// Blacklisted tails (e.g. common suffixes) are never proposed as words.
if (noSplitTails.contains(tail)) {
continue;
}
int hi = lm.getVocabulary().indexOf(head);
int ti = lm.getVocabulary().indexOf(tail);
// Both halves must be in-vocabulary.
if (hi == unkIndex || ti == unkIndex) {
continue;
}
// "de"/"da" after a morphologically correct word is likely the Turkish
// clitic written attached on purpose — not a concatenation error.
if ((tail.equals("de") || tail.equals("da")) && morphology.analyze(head).isCorrect()) {
continue;
}
if (lm.ngramExists(hi, ti)) {
k.add(new ScoredItem<>(head + " " + tail, lm.getProbability(hi, ti)));
}
}
// Sort descending by LM score so the best candidate is first.
if (k.size() > 1) {
k.sort((a, b) -> Double.compare(b.score, a.score));
}
if (k.size() > 0) {
ScoredItem<String> best = k.get(0);
// Only emit reasonably probable splits (ln-prob > -7).
if (best.score > -7) {
pw.println(word + " = " + best.item);
pwFreq.println(word + " = " + best.item + " " + wordFreq.getCount(word));
}
}
}
}
}
Example usage of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class ClassificationExperiment, method countTokens.
/**
 * Logs line, distinct-word and distinct-label counts for each given corpus file.
 * A whitespace-separated token containing "__label__" is counted as a label
 * (warning on labels that contain '-'); every other token is counted as a word.
 */
static void countTokens(Path... paths) throws IOException {
for (Path p : paths) {
List<String> content = TextIO.loadLines(p);
Histogram<String> words = new Histogram<>();
Histogram<String> labels = new Histogram<>();
for (String line : content) {
String[] tokens = line.split("[\\s]+");
for (String token : tokens) {
if (!token.contains("__label__")) {
words.add(token);
continue;
}
// Hyphens inside labels usually indicate a malformed label token.
if (token.contains("-")) {
Log.warn(line);
}
labels.add(token);
}
}
Log.info("There are %d lines, %d words, %d labels in %s", content.size(), words.size(), labels.size(), p);
}
}
Example usage of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class ConllTreebankReader, method dumpStats.
/**
 * Writes frequency statistics for a set of dependency sentences to statFile:
 * coarse POS tags, POS tags, dependency relations and individual morphological
 * feature items, each sorted by descending count.
 *
 * @param sentences parsed CoNLL dependency sentences to aggregate over
 * @param statFile  output file, written as UTF-8 text
 * @throws IOException if the statistics file cannot be written
 */
public void dumpStats(List<DependencySentence> sentences, File statFile) throws IOException {
  Histogram<CoarsePosTag> coarsePos = new Histogram<>();
  Histogram<PosTag> pos = new Histogram<>();
  Histogram<DependencyRelation> depRelations = new Histogram<>();
  Histogram<String> morphItems = new Histogram<>();
  for (DependencySentence sentence : sentences) {
    for (DependencyItem item : sentence.items) {
      coarsePos.add(item.coarsePosTag);
      pos.add(item.posTag);
      depRelations.add(item.depRelation);
      // feats is a '|'-separated feature string; count each feature separately.
      morphItems.add(Lists.newArrayList(Splitter.on("|").trimResults().omitEmptyStrings().split(item.feats)));
    }
  }
  SimpleTextWriter writer = SimpleTextWriter.keepOpenUTF8Writer(statFile);
  // Close in finally so the writer is not leaked if any writeLine throws
  // (the original called close() only on the success path).
  try {
    writer.writeLine("Sentence count:" + sentences.size());
    writer.writeLine("\nCoarse POS values:\n");
    for (CoarsePosTag coarsePo : coarsePos.getSortedList()) {
      writer.writeLine(coarsePo.getAsConnlValue() + " : " + coarsePos.getCount(coarsePo));
    }
    writer.writeLine("\nPOS values:\n");
    for (PosTag posTag : pos.getSortedList()) {
      writer.writeLine(posTag.getAsConnlValue() + " : " + pos.getCount(posTag));
    }
    writer.writeLine("\nDEP Rel values:\n");
    for (DependencyRelation depRel : depRelations.getSortedList()) {
      writer.writeLine(depRel.getAsConnlString() + " : " + depRelations.getCount(depRel));
    }
    // NOTE(review): unlike the sections above, the morph-item section has no
    // header line; kept as-is to preserve the output format.
    for (String morphItem : morphItems.getSortedList()) {
      writer.writeLine(morphItem + " : " + morphItems.getCount(morphItem));
    }
  } finally {
    writer.close();
  }
}
Example usage of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class Scripts, method checkWeirdChars.
/**
 * Scans every regular file directly under root and counts occurrences of
 * combining diacritical marks (U+0300..U+036F) and of characters listed in
 * Scripts.undesiredChars, then prints each character's code point and count,
 * sorted by descending count.
 *
 * @param root directory whose immediate children are scanned (depth 1)
 * @throws IOException if the directory cannot be traversed
 */
private static void checkWeirdChars(Path root) throws IOException {
  List<Path> files;
  // Files.walk returns a lazily-populated Stream holding open directory
  // handles; it must be closed. The original leaked it — wrap in
  // try-with-resources (fully qualified to avoid touching the import block).
  try (java.util.stream.Stream<Path> walk = Files.walk(root, 1)) {
    files = walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
  }
  Histogram<String> chars = new Histogram<>();
  for (Path file : files) {
    System.out.println(file);
    LinkedHashSet<String> sentences = getSentences(file);
    for (String sentence : sentences) {
      for (int i = 0; i < sentence.length(); i++) {
        char c = sentence.charAt(i);
        // NOTE(review): a character matching both tests below is counted
        // twice; preserved as-is — confirm whether that is intended.
        if (c >= 0x300 && c <= 0x036f) {
          chars.add(String.valueOf(c));
        }
        if (Scripts.undesiredChars.contains(c)) {
          chars.add(String.valueOf(c));
        }
      }
    }
  }
  for (String s : chars.getSortedList()) {
    // Print hex code point and occurrence count.
    System.out.println(String.format("%x %d", (int) s.charAt(0), chars.getCount(s)));
  }
}
Aggregations