use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method guessRootsWithHeuristics.
/**
 * Experimental root-guessing run; excluded from normal test execution via {@code @Ignore}.
 *
 * <p>Reads {@code out/no-parse-zemberek-freq.txt} under {@code DATA_PATH} and treats each line
 * as a word. For every word of at least four characters that contains a vowel, each prefix of
 * length three or more (that also contains a vowel) is temporarily registered as a dictionary
 * item, once from the bare string and once with the {@code [P:Verb]} tag. When the prefix is
 * followed by a vowel and ends with b/c/ğ/d, the variant ending with p/ç/k/t is registered as
 * well. If the word then receives at least one non-unknown analysis, the (prefix, word) pair is
 * recorded. Processing stops after 100000 words have passed the length/vowel filters.
 *
 * <p>Prefixes that explain at least two words are written to
 * {@code out/root-candidates-words} (prefix plus its sorted word list) and
 * {@code out/root-candidates-vocabulary} (prefix only), both encoded as UTF-8.
 *
 * @throws IOException if the input cannot be read or the outputs cannot be written
 */
@Test
@Ignore("Not a Test.")
public void guessRootsWithHeuristics() throws IOException {
  Path wordFreqFile = DATA_PATH.resolve("out/no-parse-zemberek-freq.txt");
  Log.info("Loading histogram.");
  List<String> words = Files.readAllLines(wordFreqFile);
  TurkishMorphology morphology = TurkishMorphology.builder().setLexicon("elma").disableCache().build();
  Multimap<String, String> res = HashMultimap.create(100000, 3);
  int c = 0;
  for (String s : words) {
    if (s.length() < 4 || !TurkishAlphabet.INSTANCE.containsVowel(s)) {
      continue;
    }
    for (int i = 2; i < s.length(); i++) {
      String candidateRoot = s.substring(0, i + 1);
      if (!TurkishAlphabet.INSTANCE.containsVowel(candidateRoot)) {
        continue;
      }
      List<DictionaryItem> items = new ArrayList<>(3);
      // Candidate loaded from the bare string.
      items.add(TurkishDictionaryLoader.loadFromString(candidateRoot));
      // Candidate loaded as a verb.
      items.add(TurkishDictionaryLoader.loadFromString(candidateRoot + " [P:Verb]"));
      // If a vowel follows the prefix, also try the variant whose final b/c/ğ/d is
      // replaced by p/ç/k/t.
      if (i < s.length() - 1 && Turkish.Alphabet.isVowel(s.charAt(i + 1))) {
        String stem = candidateRoot.substring(0, i);
        String variant = null;
        switch (candidateRoot.charAt(i)) {
          case 'b':
            variant = stem + 'p';
            break;
          case 'c':
            variant = stem + 'ç';
            break;
          case 'ğ':
            variant = stem + 'k';
            break;
          case 'd':
            variant = stem + 't';
            break;
          default:
            break;
        }
        if (variant != null) {
          items.add(TurkishDictionaryLoader.loadFromString(variant));
        }
      }
      for (DictionaryItem item : items) {
        morphology.getMorphotactics().getStemTransitions().addDictionaryItem(item);
        try {
          for (SingleAnalysis analysis : morphology.analyze(s)) {
            if (!analysis.isUnknown()) {
              // One successful analysis is enough; further puts would store the same pair.
              res.put(candidateRoot, s);
              break;
            }
          }
        } finally {
          // Always undo the temporary registration so later candidates start clean.
          morphology.getMorphotactics().getStemTransitions().removeDictionaryItem(item);
        }
      }
    }
    if (++c % 10000 == 0) {
      Log.info(c);
    }
    if (c == 100000) {
      break;
    }
  }
  Log.info("Writing.");
  // Explicit UTF-8: the words contain Turkish letters and the platform default may differ.
  try (PrintWriter pw1 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-words").toFile(), "UTF-8");
      PrintWriter pw2 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-vocabulary").toFile(), "UTF-8")) {
    for (String root : res.keySet()) {
      Collection<String> vals = res.get(root);
      if (vals.size() < 2) {
        continue;
      }
      List<String> wl = new ArrayList<>(vals);
      wl.sort(turkishCollator::compare);
      // Print the sorted copy; the original sorted it but then printed the unsorted values.
      pw1.println(root + " : " + String.join(", ", wl));
      pw2.println(root);
    }
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class LoadProperNouns method main.
/**
 * Collects frequent proper-noun candidates that the default morphology does not already know
 * as (non-runtime) proper nouns and writes them to the file {@code proper}.
 *
 * <p>Each non-empty input line that does not start with {@code _} is split at the first space
 * into a token and an integer count. The token's first character is dropped and the rest is
 * capitalized (presumably the first character is a marker in the vocabulary file — confirm
 * against the data). Entries are skipped when the count is below 50, the word is in the
 * {@code proper-ignore} list, the word is shorter than four characters, or an analysis already
 * yields a proper noun whose dictionary item lacks the {@code Runtime} attribute. Finally
 * {@code histogram.removeSmaller(165)} is applied and the sorted result is written as UTF-8.
 *
 * @param args unused
 * @throws IOException if an input file cannot be read or the output cannot be written
 * @throws NumberFormatException if the text after the first space is not an integer
 */
public static void main(String[] args) throws IOException {
  TurkishMorphology parserGenerator = TurkishMorphology.createWithDefaults();
  List<String> lines = Files.readAllLines(Paths.get("/home/afsina/Downloads/documents-export-2016-02-17/vocabulary-proper-full.tr.txt"));
  Histogram<String> histogram = new Histogram<>();
  Set<String> ignore = new HashSet<>(Files.readAllLines(Paths.get("morphology/src/main/resources/tr/proper-ignore")));
  for (String line : lines) {
    if (line.startsWith("_")) {
      continue;
    }
    line = line.trim();
    if (line.isEmpty()) {
      continue;
    }
    String word = Strings.subStringUntilFirst(line, " ");
    int count = Integer.parseInt(Strings.subStringAfterFirst(line, " "));
    word = Turkish.capitalize(word.substring(1));
    // Cheap filters first, so the morphological analysis only runs for real candidates.
    if (count < 50 || ignore.contains(word) || word.length() < 4) {
      continue;
    }
    boolean found = false;
    for (SingleAnalysis parse : parserGenerator.analyze(word)) {
      DictionaryItem item = parse.getDictionaryItem();
      if (item.secondaryPos == SecondaryPos.ProperNoun && !item.hasAttribute(RootAttribute.Runtime)) {
        found = true;
        break;
      }
    }
    parserGenerator.invalidateCache();
    if (!found) {
      histogram.add(word, count);
    }
  }
  histogram.removeSmaller(165);
  // Explicit UTF-8: the words contain Turkish letters and the platform default may differ.
  try (PrintWriter pw = new PrintWriter("proper", "UTF-8")) {
    histogram.getSortedList(Turkish.STRING_COMPARATOR_ASC).forEach(pw::println);
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class AutomaticLabelingExperiment method generateSetForLabelExperiment.
/**
 * Builds the labeled data set for the automatic labeling experiment.
 *
 * <p>Loads the documents in {@code input}, counts their lower-cased labels (labels of length
 * one or less are not counted), saves the full and the reduced (count at least 15) label
 * histograms under {@code experimentRoot}, and then emits one line per document in the form
 * {@code #<id> __label__a __label__b ... <processed content>}. Documents are skipped when
 * their hash was already seen, when none of their labels survived the reduction, or when the
 * processed content is shorter than 200 characters.
 *
 * @param input corpus file to load documents from
 * @param analyzer morphology instance handed to {@code processContent}
 * @param useRoots forwarded to {@code processContent}
 * @return the generated lines, shuffled with a fixed seed (1), in a {@link LinkedHashSet} so
 *     that the shuffled order is kept and identical lines collapse
 * @throws IOException if loading the corpus or saving the histograms fails
 */
Set<String> generateSetForLabelExperiment(Path input, TurkishMorphology analyzer, boolean useRoots) throws IOException {
  WebCorpus corpus = new WebCorpus("label", "labeled");
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  Log.info("Extracting data.");
  Histogram<String> labelCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    List<String> lowerCase = document.getLabels().stream()
        .filter(s -> s.length() > 1)
        .map(s -> s.toLowerCase(Turkish.LOCALE))
        .collect(Collectors.toList());
    labelCounts.add(lowerCase);
  }
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
  Log.info("All label count = %d", labelCounts.size());
  labelCounts.removeSmaller(15);
  Log.info("Reduced label count = %d", labelCounts.size());
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");
  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  Set<Long> contentHash = new HashSet<>();
  for (WebDocument document : corpus.getDocuments()) {
    Long hash = document.getHash();
    // add() returns false for an already seen hash, i.e. duplicate content.
    if (!contentHash.add(hash)) {
      continue;
    }
    List<String> labelTags = new ArrayList<>();
    for (String label : document.getLabels()) {
      // The histogram stores lower-cased labels, so the lookup must use the same form;
      // the original looked up the raw label and missed any label with upper-case letters.
      String lowerLabel = label.toLowerCase(Turkish.LOCALE);
      if (labelCounts.contains(lowerLabel)) {
        labelTags.add("__label__" + lowerLabel.replaceAll("[ ]+", "_"));
      }
    }
    if (labelTags.isEmpty()) {
      continue;
    }
    String labelStr = String.join(" ", labelTags);
    String content = document.getContentAsString();
    // Use the analyzer supplied by the caller; the original ignored this parameter.
    String processed = processContent(analyzer, content, useRoots);
    if (processed.length() < 200) {
      continue;
    }
    set.add("#" + document.getId() + " " + labelStr + " " + processed);
    // Pre-increment so the logged number equals the number of documents emitted so far.
    if (++c % 1000 == 0) {
      Log.info("%d processed.", c);
    }
  }
  Log.info("Generate train and test set.");
  Collections.shuffle(set, new Random(1));
  return new LinkedHashSet<>(set);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class WordSimilarityConsole method run.
/**
 * Interactive console that prints the 30 nearest words for each word typed on standard input.
 *
 * <p>For every input line found in the vector lookup, the neighbours are printed on one line,
 * followed by a second line listing those neighbours that the default morphology could not
 * parse. The loop ends on {@code exit}, {@code quit}, or end of input.
 *
 * @param vectorFile binary word-vector file
 * @param vocabFile vocabulary file belonging to the vectors
 * @throws IOException if the vector data cannot be loaded
 */
void run(Path vectorFile, Path vocabFile) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  System.out.println("Loading from " + vectorFile);
  WordVectorLookup lookup = WordVectorLookup.loadFromBinaryFast(vectorFile, vocabFile);
  WordVectorLookup.DistanceMatcher distanceMatcher = new WordVectorLookup.DistanceMatcher(lookup);
  System.out.println("Enter word:");
  // Deliberately not closed: closing the scanner would also close System.in.
  Scanner sc = new Scanner(System.in);
  // hasNextLine() ends the loop cleanly at end of input (piped stdin, Ctrl-D) instead of
  // letting nextLine() throw NoSuchElementException.
  while (sc.hasNextLine()) {
    String input = sc.nextLine();
    if (input.equals("exit") || input.equals("quit")) {
      break;
    }
    if (!lookup.containsWord(input)) {
      Log.info(input + " cannot be found.");
      continue;
    }
    List<WordDistances.Distance> distances = distanceMatcher.nearestK(input, 30);
    List<String> dist = distances.stream().map(d -> d.word).collect(Collectors.toList());
    System.out.println(String.join(" ", dist));
    List<String> noParse = new ArrayList<>();
    for (String s : dist) {
      WordAnalysis an = morphology.analyze(s);
      // A word belongs in the no-parse list when analysis did NOT succeed. The original
      // tested an.isCorrect() without the negation and therefore listed parsable words.
      boolean onlyUnknown = an.analysisCount() == 1
          && an.getAnalysisResults().get(0).getDictionaryItem().primaryPos == PrimaryPos.Unknown;
      if (!an.isCorrect() || onlyUnknown) {
        noParse.add(s);
      }
    }
    System.out.println(String.join(" ", noParse));
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class DictionaryOperations method saveRegular.
/**
 * Writes the lemmas of the default lexicon to the file {@code zemberek.vocab}, one per line,
 * ordered with {@code Turkish.STRING_COMPARATOR_ASC}.
 *
 * <p>Items carrying the {@code Dummy} attribute and punctuation items are left out. Proper
 * nouns and abbreviations are kept (an exclusion for them existed only as commented-out code).
 * For a lemma containing a circumflex, its circumflex-normalized form is added as well.
 *
 * @throws IOException if the vocabulary file cannot be written
 */
public static void saveRegular() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
  Set<String> vocabulary = new HashSet<>();
  for (DictionaryItem entry : morphology.getLexicon()) {
    boolean skipped = entry.attributes.contains(RootAttribute.Dummy)
        || entry.primaryPos == PrimaryPos.Punctuation;
    if (skipped) {
      continue;
    }
    String lemma = entry.lemma;
    vocabulary.add(lemma);
    if (alphabet.containsCircumflex(lemma)) {
      vocabulary.add(alphabet.normalizeCircumflex(lemma));
    }
  }
  List<String> sorted = new ArrayList<>(vocabulary);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.vocab"), sorted);
}
Aggregations