Search in sources:

Example 1 with TextFileLexicon

use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.

the class HarkavyLexiconReader method main.

/**
 * Command-line entry point of HarkavyLexiconReader.
 *
 * <p>{@code args[0]} selects the command:
 * <ul>
 *   <li>{@code "load"}: {@code args[1]} is the input file; if present, {@code args[2]} names a
 *       variant file that is opened for appending, and {@code args[3]} names a lexicon file.</li>
 *   <li>{@code "deserialise"}: {@code args[1]} is a serialized lexicon file and {@code args[2]}
 *       is a word whose frequency is looked up and logged.</li>
 * </ul>
 * The total runtime is logged in the outer {@code finally} block.
 *
 * <p>NOTE(review): this listing is an excerpt; several statements and closing braces are
 * elided (e.g. the right-hand side of the {@code lexicon} assignment), so the comments below
 * describe only what is visible.
 *
 * @param args command name followed by the command's arguments, as described above
 * @throws Exception declared by the signature; a RuntimeException is thrown for an unknown command
 */
public static void main(String[] args) throws Exception {
    long startTime = (new Date()).getTime();
    try {
        String command = args[0];
        if (command.equals("load")) {
            HarkavyLexiconReader reader = new HarkavyLexiconReader();
            File file = new File(args[1]);
            // Stays null when no variant file argument is supplied.
            Writer variantWriter = null;
            if (args.length > 2) {
                File variantFile = new File(args[2]);
                // The second FileOutputStream argument (true) opens the file in append mode.
                variantWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(variantFile, true), "UTF8"));
            try {
                // NOTE(review): the initializer is elided in this excerpt.
                TextFileLexicon lexicon =;
                if (args.length > 3) {
                    File lexiconFile = new File(args[3]);
            } finally {
                // NOTE(review): the guarded statement is elided; presumably it closes or flushes the writer.
                if (variantWriter != null)
        } else if (command.equals("deserialise")) {
            File memoryBaseFile = new File(args[1]);
            String word = args[2];
            Lexicon lexicon = TextFileLexicon.deserialize(memoryBaseFile);
            LOG.debug("Have entry " + word + ": " + lexicon.getFrequency(word));
        } else {
            throw new RuntimeException("Unknown command: " + command);
    } finally {
        // Despite its name, endTime holds the elapsed time in milliseconds.
        long endTime = (new Date()).getTime() - startTime;
        LOG.debug("Total runtime: " + ((double) endTime / 1000) + " seconds");
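Also used: FileOutputStream(java.io.FileOutputStream) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon) Lexicon(com.joliciel.jochre.lexicon.Lexicon) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) Date(java.util.Date) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter)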

Example 2 with TextFileLexicon

use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.

the class NiborskiLexiconReader method read.

/**
 * Reads a lexicon source file line by line and returns a TextFileLexicon containing the
 * collected words.
 *
 * <p>Each trimmed line is dispatched on the tag it starts with ({@code <entry>},
 * {@code <text>}, {@code <subentry>}, {@code <category>}, {@code <lemma>}, {@code <gender},
 * {@code <xref}, {@code <form}, {@code <note}, ...). Tag contents are extracted with
 * {@code indexOf}/{@code substring} rather than with an XML parser, so each handled tag is
 * expected to sit at the start of its own line.
 *
 * <p>The method uses {@code maxEntries}, {@code variantWriter}, {@code entries},
 * {@code partEntries} and {@code allVariants}, which are declared outside this excerpt.
 *
 * <p>NOTE(review): this listing is an excerpt; many statements and closing braces are elided,
 * so the comments below describe only what is visible.
 *
 * @param file the lexicon source file to scan
 * @return a lexicon in which every word of {@code entries} has been set with the value 1
 * @throws IOException declared by the signature (the method writes to {@code variantWriter})
 */
public TextFileLexicon read(File file) throws IOException {
    Scanner scanner = new Scanner(file);
    // Parser state: the entry and sub-entry currently open, or null between elements.
    NiborskiLexiconEntry entry = null;
    NiborskiLexiconSubEntry subEntry = null;
    int i = 0;
    boolean skipSubEntry = false;
    boolean skipEntry = false;
    boolean inComment = false;
    while (scanner.hasNextLine()) {
        String line = scanner.nextLine().trim();
        // Comment state is toggled only by lines that START with the comment delimiters.
        if (line.startsWith("<!--")) {
            inComment = true;
        } else if (line.startsWith("-->"))
            inComment = false;
        // NOTE(review): the statement guarded by this check is elided in the excerpt.
        if (inComment)
        if (line.startsWith("<entry>") || line.startsWith("<entry ")) {
            // A positive maxEntries is compared against the counter i; the guarded statement is elided.
            if (maxEntries > 0 && i > maxEntries)
            // A new entry may not start while the previous one is still open.
            if (entry != null) {
                throw new RuntimeException("Entry not properly ended: " + entry.text);
            entry = new NiborskiLexiconEntry();
            skipEntry = false;
        } else if (line.startsWith("</entry>")) {
            if (!skipEntry) {
                LOG.debug("" + i);
                if (entry == null) {
                    throw new RuntimeException("Entry not properly started");
                for (NiborskiLexiconSubEntry oneSubEntry : entry.subEntries) {
                    // A sub-entry that has a cross-reference but is not itself of category "xref"
                    // yields an extra entry whose text is the cross-reference and whose
                    // category, gender and notes are copied from the sub-entry.
                    if (oneSubEntry.xref.length() > 0 && !oneSubEntry.category.equals("xref")) {
                        NiborskiLexiconEntry xrefEntry = new NiborskiLexiconEntry();
                        xrefEntry.text = oneSubEntry.xref;
                        NiborskiLexiconSubEntry xrefSubEntry = new NiborskiLexiconSubEntry();
                        xrefSubEntry.category = oneSubEntry.category;
                        xrefSubEntry.gender = oneSubEntry.gender;
                        xrefSubEntry.notes = oneSubEntry.notes;
            entry = null;
        } else if (line.startsWith("<text>")) {
            int tagLength = "<text>".length();
            // The search for the closing '<' starts one character past the tag, so the
            // element text is assumed to be non-empty.
            entry.text = line.substring(tagLength, line.indexOf('<', tagLength + 1));
            // Optional <pronunciation> element on the same line.
            int pronunciationIndex = line.indexOf("<pronunciation>");
            if (pronunciationIndex >= 0) {
                tagLength = "<pronunciation>".length();
                entry.pronunciation = line.substring(pronunciationIndex + tagLength, line.indexOf('<', pronunciationIndex + tagLength + 1));
            // Optional <sup> element on the same line; its content is parsed as an integer.
            int superscriptIndex = line.indexOf("<sup>");
            if (superscriptIndex >= 0) {
                tagLength = "<sup>".length();
                entry.superscript = Integer.parseInt(line.substring(superscriptIndex + tagLength, line.indexOf('<', superscriptIndex + tagLength + 1)));
            // Entries whose text contains an ellipsis character are flagged to be skipped.
            if (entry.text.contains("…"))
                skipEntry = true;
        } else if (line.startsWith("<subentry>")) {
            subEntry = new NiborskiLexiconSubEntry();
            skipSubEntry = false;
        } else if (line.startsWith("</subentry>")) {
            if (!skipSubEntry) {
                // Rewrites one literal sequence in the lemma to a second literal (see the two arguments).
                if (subEntry.lemma != null)
                    subEntry.lemma = subEntry.lemma.replaceAll("עַ", "ע");
            subEntry = null;
        } else if (line.startsWith("<subentry skip=\"fr\">")) {
            // Sub-entries marked skip="fr" are flagged to be skipped ...
            subEntry = new NiborskiLexiconSubEntry();
            skipSubEntry = true;
        } else if (line.startsWith("<subentry skip=\"en\">")) {
            // ... whereas sub-entries marked skip="en" are kept.
            subEntry = new NiborskiLexiconSubEntry();
            skipSubEntry = false;
        } else if (line.startsWith("<category>")) {
            int tagLength = "<category>".length();
            String category = "";
            // NOTE(review): an "else" between the two assignments below appears to be elided in
            // this excerpt; as shown, the second assignment always runs.
            if (line.indexOf('<', tagLength + 1) >= 0)
                category = line.substring(tagLength, line.indexOf('<', tagLength + 1));
                category = line.substring(tagLength);
            // One specific category value is replaced by a shorter one.
            if (category.equals("פּראָנ—אַק/דאַט"))
                category = "פּראָנ—אַק";
            subEntry.category = category;
            // A <form type="..."> element may follow the category on the same line.
            if (line.contains("<form")) {
                // 12 is the length of the literal <form type=" that precedes the type value.
                int typeIndex = line.indexOf("<form type=\"") + 12;
                String type = line.substring(typeIndex, line.indexOf('"', typeIndex + 1));
                int formIndex = line.indexOf('>', typeIndex + 1) + 1;
                // If no '<' follows the form start, the form is recorded as an empty string.
                if (line.indexOf('<', formIndex + 1) >= 0) {
                    String form = line.substring(formIndex, line.indexOf('<', formIndex + 1));
                    subEntry.forms.put(type, form);
                } else {
                    subEntry.forms.put(type, "");
                int pronunciationIndex = line.indexOf("<pronunciation>");
                if (pronunciationIndex >= 0) {
                    tagLength = "<pronunciation>".length();
                    subEntry.formPronunciations.put(type, line.substring(pronunciationIndex + tagLength, line.indexOf('<', pronunciationIndex + tagLength + 1)));
        } else if (line.startsWith("<noun")) {
            subEntry.category = "noun";
        } else if (line.startsWith("<verb")) {
            subEntry.category = "verb";
        } else if (line.startsWith("<lemma>")) {
            int tagLength = "<lemma>".length();
            subEntry.lemma = line.substring(tagLength, line.indexOf('<', tagLength + 1));
        } else if (line.startsWith("<gender")) {
            // 6 is the length of the literal type=" that precedes the attribute value.
            int typeIndex = line.indexOf("type=\"") + 6;
            // Appended rather than assigned, so several <gender> lines accumulate in one string.
            subEntry.gender += line.substring(typeIndex, line.indexOf('"', typeIndex + 1));
        } else if (line.startsWith("<xref")) {
            // The cross-reference text starts after the first '>' found from index 4 onwards.
            int xrefIndex = line.indexOf('>', 4) + 1;
            subEntry.xref = line.substring(xrefIndex, line.indexOf('<', xrefIndex + 1));
            // A sub-entry with no category yet is given the category "xref".
            if (subEntry.category.length() == 0)
                subEntry.category = "xref";
        } else if (line.startsWith("<form")) {
            // Same extraction as for a <form> found on a <category> line, applied to a stand-alone line.
            int typeIndex = line.indexOf("type=\"") + 6;
            String type = line.substring(typeIndex, line.indexOf('"', typeIndex + 1));
            int formIndex = line.indexOf('>', typeIndex + 1) + 1;
            if (line.indexOf('<', formIndex + 1) >= 0) {
                String form = line.substring(formIndex, line.indexOf('<', formIndex + 1));
                subEntry.forms.put(type, form);
            } else {
                subEntry.forms.put(type, "");
            int pronunciationIndex = line.indexOf("<pronunciation>");
            if (pronunciationIndex >= 0) {
                int tagLength = "<pronunciation>".length();
                subEntry.formPronunciations.put(type, line.substring(pronunciationIndex + tagLength, line.indexOf('<', pronunciationIndex + tagLength + 1)));
        } else if (line.startsWith("<note")) {
            int typeIndex = line.indexOf("type=\"") + 6;
            String type = line.substring(typeIndex, line.indexOf('"', typeIndex + 1));
    // matches() must match the whole string, so only a text that is exactly one punctuation
    // character is excluded below.
    Pattern punctuation = Pattern.compile("\\p{Punct}");
    for (NiborskiLexicalFormEntry partEntry : partEntries) {
        String partText = partEntry.text;
        if (!punctuation.matcher(partText).matches()) {
            // NOTE(review): the guarded statement is elided; presumably it adds partText to entries.
            if (!entries.contains(partText)) {
    // When a variant writer is configured, each variant is written on its own line.
    if (this.variantWriter != null) {
        for (NiborskiLexicalFormEntry variant : allVariants) {
            variantWriter.write(variant.toString() + "\n");
    // Every collected word is set in the lexicon with the value 1.
    TextFileLexicon lexicon = new TextFileLexicon();
    for (String word : entries) {
        lexicon.setEntry(word, 1);
    return lexicon;
Also used : Scanner(java.util.Scanner) Pattern(java.util.regex.Pattern) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon)

Example 3 with TextFileLexicon

use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.

the class PlaceListReader method read.

/**
 * Reads a list of place names, one place per line, and returns a TextFileLexicon containing
 * the collected words.
 *
 * <p>Lines starting with {@code #} are ignored. Every other line is split on single spaces
 * into names. For each name a PlaceLexicalEntry is created, and for the last name of a line a
 * possessive form is derived as well.
 *
 * <p>The method uses {@code defaultAttribute}, {@code variantWriter}, {@code allVariants} and
 * {@code entries}, which are declared outside this excerpt.
 *
 * <p>NOTE(review): this listing is an excerpt; several statements and closing braces are
 * elided, so the comments below describe only what is visible.
 *
 * @param file the place list to scan
 * @return a lexicon in which every word of {@code entries} has been set with the value 1
 * @throws IOException declared by the signature (the method writes to {@code variantWriter})
 */
public TextFileLexicon read(File file) throws IOException {
    Scanner scanner = new Scanner(file);
    try {
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            // Lines starting with '#' are treated as comments.
            if (!line.startsWith("#")) {
                String[] names = line.split(" ");
                List<String> nameList = new ArrayList<>(names.length);
                for (String name : names) {
                    // NOTE(review): the guarded statement is elided; presumably it adds the name to nameList.
                    if (name.trim().length() > 0) {
                String attributes = defaultAttribute;
                // For multi-word places, the full line is recorded in a @partOf(...) attribute.
                if (nameList.size() > 1) {
                    attributes += ",@partOf(" + line + ")";
                for (int i = 0; i < nameList.size(); i++) {
                    String name = nameList.get(i);
                    PlaceLexicalEntry entry = new PlaceLexicalEntry(name, "np", name, "s", attributes);
                    // Only the last name of the line receives a possessive form.
                    if (i == nameList.size() - 1) {
                        String radical = YiddishTextUtils.removeEndForm(entry.text);
                        String possessiveForm = radical;
                        // The longer suffix is used when the radical ends in one of the four listed
                        // letters; otherwise the single-letter suffix is appended.
                        if (radical.endsWith("ס") || radical.endsWith("ש") || radical.endsWith("צ") || radical.endsWith("ת")) {
                            possessiveForm += "עס";
                        } else {
                            possessiveForm += "ס";
                        PlaceLexicalEntry possessiveEntry = new PlaceLexicalEntry(possessiveForm, "np", name, "s", attributes + ",@poss");
        // When a variant writer is configured, each variant is written on its own line.
        if (this.variantWriter != null) {
            for (PlaceLexicalEntry variant : allVariants) {
                variantWriter.write(variant.toString() + "\n");
        // Every collected word is set in the lexicon with the value 1.
        TextFileLexicon lexicon = new TextFileLexicon();
        for (String word : entries) lexicon.setEntry(word, 1);
        return lexicon;
    // NOTE(review): the body of this finally block is elided; presumably it closes the scanner.
    } finally {
Also used : Scanner(java.util.Scanner) ArrayList(java.util.ArrayList) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon)

Example 4 with TextFileLexicon

use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.

the class HarkavyLexiconReader method read.

/**
 * Reads a tab-separated lexicon file and returns a TextFileLexicon containing the collected
 * words.
 *
 * <p>Lines starting with {@code #} are ignored. Every other line must have at least five
 * tab-separated fields, because {@code parts[0]} to {@code parts[4]} are read unconditionally.
 * For each line a HarkavyLexicalFormEntry is created, and a possessive and an accusative form
 * are derived from its text unless the derived form is listed in {@code exceptions}.
 *
 * <p>The method uses {@code exceptions}, {@code variantWriter}, {@code allVariants} and
 * {@code entries}, which are declared outside this excerpt.
 *
 * <p>NOTE(review): this listing is an excerpt; several statements and closing braces are
 * elided, so the comments below describe only what is visible.
 *
 * @param file the tab-separated lexicon file to scan
 * @return a lexicon in which every word of {@code entries} has been set with the value 1
 * @throws IOException declared by the signature (the method writes to {@code variantWriter})
 */
public TextFileLexicon read(File file) throws IOException {
    Scanner scanner = new Scanner(file);
    try {
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            // Lines starting with '#' are treated as comments.
            if (!line.startsWith("#")) {
                String[] parts = line.split("\t");
                HarkavyLexicalFormEntry entry = new HarkavyLexicalFormEntry(parts[0], parts[1], parts[2], parts[3], parts[4]);
                String radical = YiddishTextUtils.removeEndForm(entry.text);
                // Possessive form: the longer suffix is used when the radical ends in one of the
                // four listed letters; otherwise the single-letter suffix is appended.
                String possessiveForm = radical;
                if (radical.endsWith("ס") || radical.endsWith("ש") || radical.endsWith("צ") || radical.endsWith("ת")) {
                    possessiveForm += "עס";
                } else {
                    possessiveForm += "ס";
                // The derived entry keeps fields 1-3 and appends ",@poss" to field 4.
                if (!exceptions.contains(possessiveForm)) {
                    HarkavyLexicalFormEntry possessiveEntry = new HarkavyLexicalFormEntry(possessiveForm, parts[1], parts[2], parts[3], parts[4] + ",@poss");
                // Accusative/dative form: the longer suffix is used when the radical ends in one of
                // the three listed letters; otherwise the single-letter suffix is appended.
                String accusativeForm = radical;
                if (radical.endsWith("ל") || radical.endsWith("מ") || radical.endsWith("נ")) {
                    accusativeForm += "ען";
                } else {
                    accusativeForm += "ן";
                // The derived entry keeps fields 1-3 and appends ",@acc,@dat" to field 4.
                if (!exceptions.contains(accusativeForm)) {
                    HarkavyLexicalFormEntry accusativeEntry = new HarkavyLexicalFormEntry(accusativeForm, parts[1], parts[2], parts[3], parts[4] + ",@acc,@dat");
        // When a variant writer is configured, each variant is written on its own line.
        if (this.variantWriter != null) {
            for (HarkavyLexicalFormEntry variant : allVariants) {
                variantWriter.write(variant.toString() + "\n");
        // Every collected word is set in the lexicon with the value 1.
        TextFileLexicon lexicon = new TextFileLexicon();
        for (String word : entries) lexicon.setEntry(word, 1);
        return lexicon;
    // NOTE(review): the body of this finally block is elided; presumably it closes the scanner.
    } finally {
Also used : Scanner(java.util.Scanner) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon)

Example 5 with TextFileLexicon

use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.

the class NameListReader method main.

/**
 * Command-line entry point of NameListReader.
 *
 * <p>{@code args[0]} selects the command:
 * <ul>
 *   <li>{@code "load"}: {@code args[1]} is the input file; if present, {@code args[2]} names a
 *       variant file that is opened for appending and {@code args[3]} names a lexicon file;
 *       {@code args[4]} is assigned to the reader's {@code defaultAttribute}.</li>
 *   <li>{@code "deserialise"}: {@code args[1]} is a serialized lexicon file; the frequencies
 *       of two hard-coded words are looked up and logged.</li>
 * </ul>
 * The total runtime is logged in the outer {@code finally} block.
 *
 * <p>NOTE(review): this listing is an excerpt; several statements and closing braces are
 * elided (e.g. the right-hand side of the {@code lexicon} assignment), so the comments below
 * describe only what is visible.
 *
 * @param args command name followed by the command's arguments, as described above
 * @throws Exception declared by the signature; a RuntimeException is thrown for an unknown command
 */
public static void main(String[] args) throws Exception {
    long startTime = (new Date()).getTime();
    try {
        String command = args[0];
        if (command.equals("load")) {
            NameListReader reader = new NameListReader();
            File file = new File(args[1]);
            // Stays null when no variant file argument is supplied.
            Writer variantWriter = null;
            if (args.length > 2) {
                File variantFile = new File(args[2]);
                // The second FileOutputStream argument (true) opens the file in append mode.
                variantWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(variantFile, true), "UTF8"));
            // NOTE(review): args[4] appears to be read without a length check, although args[2]
            // and args[3] are treated as optional; with fewer than five arguments this would
            // throw ArrayIndexOutOfBoundsException. Confirm against the full source.
            reader.defaultAttribute = args[4];
            try {
                // NOTE(review): the initializer is elided in this excerpt.
                TextFileLexicon lexicon =;
                if (args.length > 3) {
                    File lexiconFile = new File(args[3]);
            } finally {
                // NOTE(review): the guarded statement is elided; presumably it closes or flushes the writer.
                if (variantWriter != null)
        } else if (command.equals("deserialise")) {
            File memoryBaseFile = new File(args[1]);
            Lexicon lexicon = TextFileLexicon.deserialize(memoryBaseFile);
            // Two hard-coded sample words are looked up in the deserialized lexicon.
            String[] words = new String[] { "חײמס", "חױמס" };
            for (String word : words) LOG.debug("Have entry " + word + ": " + lexicon.getFrequency(word));
        } else {
            throw new RuntimeException("Unknown command: " + command);
    } finally {
        // Despite its name, endTime holds the elapsed time in milliseconds.
        long endTime = (new Date()).getTime() - startTime;
        LOG.debug("Total runtime: " + ((double) endTime / 1000) + " seconds");
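Also used: FileOutputStream(java.io.FileOutputStream) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon) Lexicon(com.joliciel.jochre.lexicon.Lexicon) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) Date(java.util.Date) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter)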


TextFileLexicon (com.joliciel.jochre.lexicon.TextFileLexicon)11 File ( BufferedWriter ( FileOutputStream ( OutputStreamWriter ( Writer ( Lexicon (com.joliciel.jochre.lexicon.Lexicon)5 Scanner (java.util.Scanner)5 Date (java.util.Date)4 CorpusLexiconBuilder (com.joliciel.jochre.lexicon.CorpusLexiconBuilder)2 LexiconErrorWriter (com.joliciel.jochre.lexicon.LexiconErrorWriter)2 UnknownWordListWriter (com.joliciel.jochre.lexicon.UnknownWordListWriter)2 IOException ( BeamSearchImageAnalyser (com.joliciel.jochre.analyser.BeamSearchImageAnalyser)1 ErrorLogger (com.joliciel.jochre.analyser.ErrorLogger)1 FScoreObserver (com.joliciel.jochre.analyser.FScoreObserver)1 ImageAnalyser (com.joliciel.jochre.analyser.ImageAnalyser)1 LetterAssigner (com.joliciel.jochre.analyser.LetterAssigner)1 LetterGuessObserver (com.joliciel.jochre.analyser.LetterGuessObserver)1 OriginalShapeLetterAssigner (com.joliciel.jochre.analyser.OriginalShapeLetterAssigner)1