Search in sources :

Example 81 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method guessRootsWithHeuristics.

/**
 * Offline script (not a real test) that guesses candidate roots for words listed in
 * {@code out/no-parse-zemberek-freq.txt}.
 *
 * <p>For every word of at least four characters that contains a vowel, each prefix of three or
 * more characters is temporarily registered as a dictionary item (as a plain entry, as a verb,
 * and, when applicable, as a variant with a devoiced last consonant). If the word becomes
 * analyzable with that temporary item, the prefix is recorded as a candidate root for the word.
 *
 * <p>Candidate roots that explain at least two words are written to
 * {@code out/root-candidates-words} (root plus its sorted words) and
 * {@code out/root-candidates-vocabulary} (root only).
 *
 * @throws IOException if the input cannot be read or the outputs cannot be written
 */
@Test
@Ignore("Not a Test.")
public void guessRootsWithHeuristics() throws IOException {
    Path wordFreqFile = DATA_PATH.resolve("out/no-parse-zemberek-freq.txt");
    Log.info("Loading histogram.");
    List<String> words = Files.readAllLines(wordFreqFile);
    TurkishMorphology morphology = TurkishMorphology.builder().setLexicon("elma").disableCache().build();
    Multimap<String, String> res = HashMultimap.create(100000, 3);
    int c = 0;
    for (String s : words) {
        if (s.length() < 4) {
            continue;
        }
        if (!TurkishAlphabet.INSTANCE.containsVowel(s)) {
            continue;
        }
        // Try every prefix of length 3..s.length() as a root candidate.
        for (int i = 2; i < s.length(); i++) {
            String candidateRoot = s.substring(0, i + 1);
            if (!TurkishAlphabet.INSTANCE.containsVowel(candidateRoot)) {
                continue;
            }
            List<DictionaryItem> items = new ArrayList<>(3);
            // Plain entry, no explicit part of speech given.
            items.add(TurkishDictionaryLoader.loadFromString(candidateRoot));
            // Same surface form, explicitly marked as a verb.
            items.add(TurkishDictionaryLoader.loadFromString(candidateRoot + " [P:Verb]"));
            char last = candidateRoot.charAt(candidateRoot.length() - 1);
            if (i < s.length() - 1) {
                char next = s.charAt(candidateRoot.length());
                if (Turkish.Alphabet.isVowel(next)) {
                    // The candidate is followed by a vowel: if it ends with b/c/ğ/d also try the
                    // variant ending with p/ç/k/t respectively.
                    String stem = candidateRoot.substring(0, candidateRoot.length() - 1);
                    String f = "";
                    switch (last) {
                        case 'b':
                            f = stem + 'p';
                            break;
                        case 'c':
                            f = stem + 'ç';
                            break;
                        case 'ğ':
                            f = stem + 'k';
                            break;
                        case 'd':
                            f = stem + 't';
                            break;
                        default:
                            break;
                    }
                    if (f.length() > 0) {
                        items.add(TurkishDictionaryLoader.loadFromString(f));
                    }
                }
            }
            for (DictionaryItem item : items) {
                // Register the item only for the duration of this single analysis.
                morphology.getMorphotactics().getStemTransitions().addDictionaryItem(item);
                WordAnalysis analyze = morphology.analyze(s);
                for (SingleAnalysis wordAnalysis : analyze) {
                    if (!wordAnalysis.isUnknown()) {
                        res.put(candidateRoot, s);
                    }
                }
                morphology.getMorphotactics().getStemTransitions().removeDictionaryItem(item);
            }
        }
        if (++c % 10000 == 0) {
            Log.info(c);
        }
        // Hard cap on the number of processed words.
        if (c == 100000) {
            break;
        }
    }
    Log.info("Writing.");
    try (PrintWriter pw1 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-words").toFile());
        PrintWriter pw2 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-vocabulary").toFile())) {
        for (String root : res.keySet()) {
            Collection<String> vals = res.get(root);
            // A root supported by a single word is not considered reliable enough.
            if (vals.size() < 2) {
                continue;
            }
            List<String> wl = new ArrayList<>(vals);
            wl.sort(turkishCollator::compare);
            // Write the sorted copy; the multimap's own value collection is unordered.
            pw1.println(root + " : " + String.join(", ", wl));
            pw2.println(root);
        }
    }
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 82 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class LoadProperNouns method main.

/**
 * Collects frequent proper-noun candidates that are not yet covered by the lexicon and writes
 * them to the file {@code proper}.
 *
 * <p>Each usable input line is split at the first space into a token and an integer count. The
 * first character of the token is dropped and the remainder is capitalized. A word is counted
 * only if its count is at least 50, it is not in the ignore list, it has no analysis whose
 * dictionary item is a non-runtime proper noun, and it is at least four characters long.
 *
 * @param args unused
 * @throws IOException if an input file cannot be read or the output cannot be written
 */
public static void main(String[] args) throws IOException {
    TurkishMorphology parserGenerator = TurkishMorphology.createWithDefaults();
    List<String> lines = Files.readAllLines(Paths.get("/home/afsina/Downloads/documents-export-2016-02-17/vocabulary-proper-full.tr.txt"));
    Histogram<String> histogram = new Histogram<>();
    Set<String> ignore = new HashSet<>(Files.readAllLines(Paths.get("morphology/src/main/resources/tr/proper-ignore")));
    for (String rawLine : lines) {
        // The underscore check is applied to the untrimmed line.
        if (rawLine.startsWith("_")) {
            continue;
        }
        String entry = rawLine.trim();
        if (entry.isEmpty()) {
            continue;
        }
        String token = Strings.subStringUntilFirst(entry, " ");
        int count = Integer.parseInt(Strings.subStringAfterFirst(entry, " "));
        String word = Turkish.capitalize(token.substring(1));
        if (count < 50 || ignore.contains(word)) {
            continue;
        }
        // Look for an analysis backed by a proper-noun item that is not flagged as Runtime.
        boolean found = false;
        for (SingleAnalysis parse : parserGenerator.analyze(word)) {
            boolean properNoun = parse.getDictionaryItem().secondaryPos.equals(SecondaryPos.ProperNoun);
            if (properNoun && !parse.getDictionaryItem().hasAttribute(RootAttribute.Runtime)) {
                found = true;
            }
        }
        parserGenerator.invalidateCache();
        if (!found && word.length() >= 4) {
            histogram.add(word, count);
        }
    }
    histogram.removeSmaller(165);
    try (PrintWriter pw = new PrintWriter("proper")) {
        histogram.getSortedList(Turkish.STRING_COMPARATOR_ASC).forEach(item -> pw.println(item));
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) PrintWriter(java.io.PrintWriter)

Example 83 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class AutomaticLabelingExperiment method generateSetForLabelExperiment.

/**
 * Builds the labelled text lines used by the automatic labeling experiment.
 *
 * <p>Labels of all documents are lower-cased and counted; the full and the reduced label
 * histograms are saved under {@code experimentRoot}. Then, for every document with a distinct
 * content hash that carries at least one retained label, a line of the form
 * {@code #<id> __label__a __label__b <processed content>} is produced. Documents whose processed
 * content is shorter than 200 characters are dropped.
 *
 * @param input path the corpus documents are loaded from
 * @param analyzer morphology instance handed to {@code processContent}
 * @param useRoots forwarded to {@code processContent}
 * @return the generated lines, shuffled with a fixed seed, as an insertion-ordered set
 * @throws IOException if the corpus cannot be loaded or a histogram cannot be saved
 */
Set<String> generateSetForLabelExperiment(Path input, TurkishMorphology analyzer, boolean useRoots) throws IOException {
    WebCorpus corpus = new WebCorpus("label", "labeled");
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    Log.info("Extracting data.");
    Histogram<String> labelCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        List<String> labels = document.getLabels();
        // Single character labels are ignored; counting is done on the lower-cased form.
        List<String> lowerCase = labels.stream().filter(s -> s.length() > 1).map(s -> s.toLowerCase(Turkish.LOCALE)).collect(Collectors.toList());
        labelCounts.add(lowerCase);
    }
    labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
    Log.info("All label count = %d", labelCounts.size());
    // NOTE(review): presumably drops labels seen fewer than 15 times - confirm against Histogram.
    labelCounts.removeSmaller(15);
    Log.info("Reduced label count = %d", labelCounts.size());
    labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    Set<Long> contentHash = new HashSet<>();
    for (WebDocument document : corpus.getDocuments()) {
        // Set.add returns false for an already seen hash, i.e. duplicated content.
        if (!contentHash.add(document.getHash())) {
            continue;
        }
        List<String> labelTags = new ArrayList<>();
        for (String label : document.getLabels()) {
            // The histogram holds lower-cased labels, so the lookup must use the same form.
            String normalized = label.toLowerCase(Turkish.LOCALE);
            if (labelCounts.contains(normalized)) {
                labelTags.add("__label__" + normalized.replaceAll("[ ]+", "_"));
            }
        }
        if (labelTags.isEmpty()) {
            continue;
        }
        String labelStr = String.join(" ", labelTags);
        String content = document.getContentAsString();
        // Use the analyzer supplied by the caller.
        String processed = processContent(analyzer, content, useRoots);
        if (processed.length() < 200) {
            continue;
        }
        set.add("#" + document.getId() + " " + labelStr + " " + processed);
        // Report progress once per 1000 accepted documents.
        if (++c % 1000 == 0) {
            Log.info("%d processed.", c);
        }
    }
    Log.info("Generate train and test set.");
    // Fixed seed keeps the shuffled order reproducible between runs.
    Collections.shuffle(set, new Random(1));
    return new LinkedHashSet<>(set);
}
Also used : FastTextTrainer(zemberek.core.embeddings.FastTextTrainer) Stopwatch(com.google.common.base.Stopwatch) WebCorpus(zemberek.corpus.WebCorpus) Random(java.util.Random) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) WebDocument(zemberek.corpus.WebDocument) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Turkish(zemberek.core.turkish.Turkish) Token(zemberek.tokenization.Token) Locale(java.util.Locale) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) FastText(zemberek.core.embeddings.FastText) PrintWriter(java.io.PrintWriter) Args(zemberek.core.embeddings.Args) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Paths(java.nio.file.Paths) ScoredItem(zemberek.core.ScoredItem) Comparator(java.util.Comparator) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList) WebDocument(zemberek.corpus.WebDocument) Random(java.util.Random) WebCorpus(zemberek.corpus.WebCorpus) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 84 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class WordSimilarityConsole method run.

/**
 * Interactive console: reads words from standard input and, for each word known to the vector
 * lookup, prints its 30 nearest words followed by the subset of those that the morphological
 * analyzer could not parse.
 *
 * <p>The loop ends when the user types {@code exit} or {@code quit}, or when standard input is
 * exhausted.
 *
 * @param vectorFile binary word vector file
 * @param vocabFile vocabulary file belonging to the vector file
 * @throws IOException if the vector data cannot be loaded
 */
void run(Path vectorFile, Path vocabFile) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    System.out.println("Loading from " + vectorFile);
    WordVectorLookup lookup = WordVectorLookup.loadFromBinaryFast(vectorFile, vocabFile);
    WordVectorLookup.DistanceMatcher distanceMatcher = new WordVectorLookup.DistanceMatcher(lookup);
    System.out.println("Enter word:");
    // Deliberately not closed: closing the Scanner would also close System.in.
    Scanner sc = new Scanner(System.in);
    // hasNextLine() guards against end of input (e.g. piped stdin), where nextLine() would throw.
    while (sc.hasNextLine()) {
        String input = sc.nextLine();
        if (input.equals("exit") || input.equals("quit")) {
            break;
        }
        if (!lookup.containsWord(input)) {
            Log.info(input + " cannot be found.");
            continue;
        }
        List<WordDistances.Distance> distances = distanceMatcher.nearestK(input, 30);
        List<String> dist = distances.stream().map(d -> d.word).collect(Collectors.toList());
        System.out.println(String.join(" ", dist));
        List<String> noParse = new ArrayList<>();
        for (String s : dist) {
            WordAnalysis an = morphology.analyze(s);
            // Collect words WITHOUT a usable parse: either the analysis is not correct, or its
            // only result is backed by an Unknown dictionary item.
            // NOTE(review): assumes isCorrect() means "has a valid analysis" - confirm.
            if (!an.isCorrect() || (an.analysisCount() == 1 && an.getAnalysisResults().get(0).getDictionaryItem().primaryPos == PrimaryPos.Unknown)) {
                noParse.add(s);
            }
        }
        System.out.println(String.join(" ", noParse));
    }
}
Also used : Scanner(java.util.Scanner) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 85 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class DictionaryOperations method saveRegular.

/**
 * Writes the lemmas of the default lexicon, sorted with {@code Turkish.STRING_COMPARATOR_ASC},
 * to the file {@code zemberek.vocab}.
 *
 * <p>Items carrying the {@code Dummy} attribute and punctuation items are left out. Proper nouns
 * and abbreviations are not filtered. For a lemma containing a circumflex, the form returned by
 * {@code normalizeCircumflex} is written as well.
 *
 * @throws IOException if the vocabulary file cannot be written
 */
public static void saveRegular() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
    // A set removes duplicated lemmas before sorting.
    Set<String> vocabulary = new HashSet<>();
    for (DictionaryItem entry : morphology.getLexicon()) {
        boolean skipped = entry.attributes.contains(RootAttribute.Dummy)
            || entry.primaryPos == PrimaryPos.Punctuation;
        if (skipped) {
            continue;
        }
        String lemma = entry.lemma;
        vocabulary.add(lemma);
        if (alphabet.containsCircumflex(lemma)) {
            vocabulary.add(alphabet.normalizeCircumflex(lemma));
        }
    }
    List<String> sorted = new ArrayList<>(vocabulary);
    sorted.sort(Turkish.STRING_COMPARATOR_ASC);
    Files.write(Paths.get("zemberek.vocab"), sorted);
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology)87 Test (org.junit.Test)38 Path (java.nio.file.Path)34 ArrayList (java.util.ArrayList)23 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)23 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)23 Ignore (org.junit.Ignore)21 DictionaryItem (zemberek.morphology.lexicon.DictionaryItem)15 LinkedHashSet (java.util.LinkedHashSet)13 PrintWriter (java.io.PrintWriter)10 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)10 Stopwatch (com.google.common.base.Stopwatch)8 Histogram (zemberek.core.collections.Histogram)8 Token (zemberek.tokenization.Token)8 HashSet (java.util.HashSet)7 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)7 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)7 ScoredItem (zemberek.core.ScoredItem)6 IOException (java.io.IOException)5 BlockTextLoader (zemberek.core.text.BlockTextLoader)5