use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.
the class HarkavyLexiconReader method main.
/**
 * Command-line entry point.
 *
 * <p>Two commands are recognised, selected by {@code args[0]}:
 * <ul>
 * <li>{@code load}: reads the file named by {@code args[1]}; if {@code args[2]} is given,
 * variants are written to that file (any existing file is deleted first); if {@code args[3]}
 * is given, the resulting lexicon is serialized to that file.</li>
 * <li>{@code deserialise}: deserializes the lexicon stored in {@code args[1]} and logs the
 * frequency it reports for the word {@code args[2]}.</li>
 * </ul>
 * The total runtime is logged at debug level, whether or not the command succeeds.
 *
 * @param args the command followed by its arguments, as described above
 * @throws Exception if reading, writing or (de)serialization fails
 * @throws RuntimeException if the command is not recognised
 */
public static void main(String[] args) throws Exception {
  final long startedAt = System.currentTimeMillis();
  try {
    final String command = args[0];
    switch (command) {
      case "load": {
        HarkavyLexiconReader lexiconReader = new HarkavyLexiconReader();
        File sourceFile = new File(args[1]);

        // The variant output is optional: only opened when a third argument is present.
        Writer variantOut = null;
        if (args.length > 2) {
          File variantTarget = new File(args[2]);
          // Remove any previous output so the append-mode stream starts from an empty file.
          variantTarget.delete();
          FileOutputStream rawStream = new FileOutputStream(variantTarget, true);
          OutputStreamWriter encoder = new OutputStreamWriter(rawStream, "UTF8");
          variantOut = new BufferedWriter(encoder);
        }
        lexiconReader.setVariantWriter(variantOut);

        try {
          TextFileLexicon loaded = lexiconReader.read(sourceFile);
          if (args.length > 3) {
            loaded.serialize(new File(args[3]));
          }
        } finally {
          if (variantOut != null) {
            variantOut.close();
          }
        }
        break;
      }
      case "deserialise": {
        File serializedLexicon = new File(args[1]);
        String query = args[2];
        Lexicon restored = TextFileLexicon.deserialize(serializedLexicon);
        LOG.debug("Have entry " + query + ": " + restored.getFrequency(query));
        break;
      }
      default:
        throw new RuntimeException("Unknown command: " + command);
    }
  } finally {
    long elapsedMillis = System.currentTimeMillis() - startedAt;
    double elapsedSeconds = (double) elapsedMillis / 1000;
    LOG.debug("Total runtime: " + elapsedSeconds + " seconds");
  }
}
use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.
the class NiborskiLexiconReader method read.
/**
 * Reads an XML-like dictionary file line by line and builds a lexicon from it.
 *
 * <p>The file is not parsed as XML: each (trimmed) line is matched against a fixed set of
 * opening tags ({@code <entry>}, {@code <text>}, {@code <subentry>}, {@code <category>},
 * {@code <lemma>}, {@code <gender>}, {@code <xref>}, {@code <form>}, {@code <note>}, ...) and
 * the value is cut out of the line with {@code indexOf}/{@code substring}. Completed entries
 * are handed to {@code addEntry}. After scanning, every item of {@code partEntries} whose text
 * is not a single punctuation character and is not yet in {@code entries} is added to
 * {@code allVariants} and {@code entries}; all variants are written to the variant writer (if
 * one is set), and every word in {@code entries} is put in the returned lexicon with
 * frequency 1.
 *
 * <p>The scanner is closed on every exit path, including when a malformed line makes the
 * parsing code throw.
 *
 * @param file the dictionary file to read
 * @return a lexicon containing every collected word with frequency 1
 * @throws IOException if the file cannot be opened or the variant writer fails
 * @throws RuntimeException if an {@code <entry>} starts before the previous one ended, or an
 *         {@code </entry>} is found with no open entry
 */
public TextFileLexicon read(File file) throws IOException {
  NiborskiLexiconEntry entry = null;
  NiborskiLexiconSubEntry subEntry = null;
  int i = 0;
  boolean skipSubEntry = false;
  boolean skipEntry = false;
  boolean inComment = false;

  // try-with-resources guarantees the file handle is released even when a malformed line
  // triggers an unchecked exception somewhere in the parsing branches below.
  try (Scanner scanner = new Scanner(file)) {
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine().trim();
      LOG.debug(line);

      // Comment tracking is line based: a line starting with "<!--" opens a comment and only
      // a later line starting with "-->" closes it.
      // NOTE(review): a one-line comment ("<!-- ... -->") therefore leaves inComment set until
      // some later line starts with "-->" - confirm the input never contains such lines.
      if (line.startsWith("<!--")) {
        inComment = true;
      } else if (line.startsWith("-->")) {
        inComment = false;
      }
      if (inComment) {
        continue;
      }

      if (line.startsWith("<entry>") || line.startsWith("<entry ")) {
        // NOTE(review): the test uses ">" and runs before the increment, so up to
        // maxEntries + 1 entries are read before stopping - confirm this is intended.
        if (maxEntries > 0 && i > maxEntries) {
          break;
        }
        i++;
        if (entry != null) {
          throw new RuntimeException("Entry not properly ended: " + entry.text);
        }
        entry = new NiborskiLexiconEntry();
        skipEntry = false;
      } else if (line.startsWith("</entry>")) {
        if (!skipEntry) {
          LOG.debug("" + i);
          if (entry == null) {
            throw new RuntimeException("Entry not properly started");
          }
          this.addEntry(entry);
          // A sub-entry that carries a cross-reference but has a real category also produces
          // a separate entry for the referenced text, copying category, gender and notes.
          for (NiborskiLexiconSubEntry oneSubEntry : entry.subEntries) {
            if (oneSubEntry.xref.length() > 0 && !oneSubEntry.category.equals("xref")) {
              NiborskiLexiconEntry xrefEntry = new NiborskiLexiconEntry();
              xrefEntry.text = oneSubEntry.xref;
              NiborskiLexiconSubEntry xrefSubEntry = new NiborskiLexiconSubEntry();
              xrefSubEntry.category = oneSubEntry.category;
              xrefSubEntry.gender = oneSubEntry.gender;
              xrefSubEntry.notes = oneSubEntry.notes;
              xrefEntry.subEntries.add(xrefSubEntry);
              this.addEntry(xrefEntry);
            }
          }
        }
        entry = null;
      } else if (line.startsWith("<text>")) {
        int tagLength = "<text>".length();
        entry.text = line.substring(tagLength, line.indexOf('<', tagLength + 1));
        int pronunciationIndex = line.indexOf("<pronunciation>");
        if (pronunciationIndex >= 0) {
          tagLength = "<pronunciation>".length();
          entry.pronunciation = line.substring(pronunciationIndex + tagLength, line.indexOf('<', pronunciationIndex + tagLength + 1));
        }
        int superscriptIndex = line.indexOf("<sup>");
        if (superscriptIndex >= 0) {
          tagLength = "<sup>".length();
          entry.superscript = Integer.parseInt(line.substring(superscriptIndex + tagLength, line.indexOf('<', superscriptIndex + tagLength + 1)));
        }
        // Entries whose text contains an ellipsis are parsed but never added.
        if (entry.text.contains("…")) {
          skipEntry = true;
        }
      } else if (line.startsWith("<subentry>")) {
        subEntry = new NiborskiLexiconSubEntry();
        skipSubEntry = false;
      } else if (line.startsWith("</subentry>")) {
        if (!skipSubEntry) {
          if (subEntry.lemma != null) {
            subEntry.lemma = subEntry.lemma.replaceAll("עַ", "ע");
          }
          entry.subEntries.add(subEntry);
        }
        subEntry = null;
      } else if (line.startsWith("<subentry skip=\"fr\">")) {
        // Sub-entries marked skip="fr" are parsed but dropped when they close.
        subEntry = new NiborskiLexiconSubEntry();
        skipSubEntry = true;
      } else if (line.startsWith("<subentry skip=\"en\">")) {
        // Sub-entries marked skip="en" are kept.
        subEntry = new NiborskiLexiconSubEntry();
        skipSubEntry = false;
      } else if (line.startsWith("<category>")) {
        int tagLength = "<category>".length();
        String category = "";
        if (line.indexOf('<', tagLength + 1) >= 0) {
          category = line.substring(tagLength, line.indexOf('<', tagLength + 1));
        } else {
          category = line.substring(tagLength);
        }
        if (category.equals("פּראָנ—אַק/דאַט")) {
          category = "פּראָנ—אַק";
        }
        subEntry.category = category;
        // A <form type="..."> element may follow the category on the same line.
        if (line.contains("<form")) {
          int typeIndex = line.indexOf("<form type=\"") + 12;
          String type = line.substring(typeIndex, line.indexOf('"', typeIndex + 1));
          int formIndex = line.indexOf('>', typeIndex + 1) + 1;
          if (line.indexOf('<', formIndex + 1) >= 0) {
            String form = line.substring(formIndex, line.indexOf('<', formIndex + 1));
            subEntry.forms.put(type, form);
          } else {
            subEntry.forms.put(type, "");
          }
          int pronunciationIndex = line.indexOf("<pronunciation>");
          if (pronunciationIndex >= 0) {
            tagLength = "<pronunciation>".length();
            subEntry.formPronunciations.put(type, line.substring(pronunciationIndex + tagLength, line.indexOf('<', pronunciationIndex + tagLength + 1)));
          }
        }
      } else if (line.startsWith("<noun")) {
        subEntry.category = "noun";
      } else if (line.startsWith("<verb")) {
        subEntry.category = "verb";
      } else if (line.startsWith("<lemma>")) {
        int tagLength = "<lemma>".length();
        subEntry.lemma = line.substring(tagLength, line.indexOf('<', tagLength + 1));
      } else if (line.startsWith("<gender")) {
        // Gender values are appended, so several <gender> lines accumulate in one string.
        int typeIndex = line.indexOf("type=\"") + 6;
        subEntry.gender += line.substring(typeIndex, line.indexOf('"', typeIndex + 1));
      } else if (line.startsWith("<xref")) {
        int xrefIndex = line.indexOf('>', 4) + 1;
        subEntry.xref = line.substring(xrefIndex, line.indexOf('<', xrefIndex + 1));
        // A sub-entry with no category of its own is a pure cross-reference.
        if (subEntry.category.length() == 0) {
          subEntry.category = "xref";
        }
      } else if (line.startsWith("<form")) {
        int typeIndex = line.indexOf("type=\"") + 6;
        String type = line.substring(typeIndex, line.indexOf('"', typeIndex + 1));
        int formIndex = line.indexOf('>', typeIndex + 1) + 1;
        if (line.indexOf('<', formIndex + 1) >= 0) {
          String form = line.substring(formIndex, line.indexOf('<', formIndex + 1));
          subEntry.forms.put(type, form);
        } else {
          subEntry.forms.put(type, "");
        }
        int pronunciationIndex = line.indexOf("<pronunciation>");
        if (pronunciationIndex >= 0) {
          int tagLength = "<pronunciation>".length();
          subEntry.formPronunciations.put(type, line.substring(pronunciationIndex + tagLength, line.indexOf('<', pronunciationIndex + tagLength + 1)));
        }
      } else if (line.startsWith("<note")) {
        int typeIndex = line.indexOf("type=\"") + 6;
        String type = line.substring(typeIndex, line.indexOf('"', typeIndex + 1));
        subEntry.notes.add(type);
      }
    }
  }

  // Add part entries not seen so far, ignoring those that are a single punctuation character
  // (matches() requires the whole text to match \p{Punct}).
  Pattern punctuation = Pattern.compile("\\p{Punct}");
  for (NiborskiLexicalFormEntry partEntry : partEntries) {
    String partText = partEntry.text;
    if (!punctuation.matcher(partText).matches()) {
      if (!entries.contains(partText)) {
        allVariants.add(partEntry);
        entries.add(partText);
      }
    }
  }

  if (this.variantWriter != null) {
    for (NiborskiLexicalFormEntry variant : allVariants) {
      variantWriter.write(variant.toString() + "\n");
      variantWriter.flush();
    }
  }

  TextFileLexicon lexicon = new TextFileLexicon();
  for (String word : entries) {
    lexicon.setEntry(word, 1);
  }
  return lexicon;
}
use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.
the class PlaceListReader method read.
/**
 * Reads a list of place names, one place per line, and builds a lexicon from it.
 *
 * <p>Lines starting with {@code #} are ignored. Every other line is split on single spaces;
 * each non-blank token becomes an entry of category {@code "np"} carrying
 * {@code defaultAttribute}, extended with {@code ,@partOf(<line>)} when the line has more than
 * one token. For the last token of a line a possessive variant (marked {@code ,@poss}) is also
 * generated. All variants are written to the variant writer, if one is set, and every
 * collected word is put in the returned lexicon with frequency 1.
 *
 * @param file the place list to read
 * @return a lexicon containing every collected word with frequency 1
 * @throws IOException if the file cannot be opened or the variant writer fails
 */
public TextFileLexicon read(File file) throws IOException {
  try (Scanner input = new Scanner(file)) {
    while (input.hasNextLine()) {
      final String rawLine = input.nextLine();
      if (rawLine.startsWith("#")) {
        continue;
      }

      // Collect the trimmed, non-empty tokens of the line.
      final String[] tokens = rawLine.split(" ");
      final List<String> words = new ArrayList<>(tokens.length);
      for (String token : tokens) {
        final String cleaned = token.trim();
        if (cleaned.length() > 0) {
          words.add(cleaned);
        }
      }

      // Multi-word places record the full line on each of their words.
      String attributes = defaultAttribute;
      if (words.size() > 1) {
        attributes = attributes + ",@partOf(" + rawLine + ")";
      }

      final int lastPosition = words.size() - 1;
      int position = 0;
      for (String word : words) {
        PlaceLexicalEntry plain = new PlaceLexicalEntry(word, "np", word, "s", attributes);
        allVariants.add(plain);
        entries.add(plain.text);

        // Only the final word of a place name receives a possessive variant.
        if (position == lastPosition) {
          final String radical = YiddishTextUtils.removeEndForm(plain.text);
          final boolean needsLinkingVowel = radical.endsWith("ס") || radical.endsWith("ש")
              || radical.endsWith("צ") || radical.endsWith("ת");
          final String possessiveForm = radical + (needsLinkingVowel ? "עס" : "ס");
          PlaceLexicalEntry possessive = new PlaceLexicalEntry(possessiveForm, "np", word, "s", attributes + ",@poss");
          allVariants.add(possessive);
          entries.add(possessive.text);
        }
        position++;
      }
    }

    if (this.variantWriter != null) {
      for (PlaceLexicalEntry variant : allVariants) {
        variantWriter.write(variant.toString() + "\n");
        variantWriter.flush();
      }
    }

    final TextFileLexicon result = new TextFileLexicon();
    for (String word : entries) {
      result.setEntry(word, 1);
    }
    return result;
  }
}
use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.
the class HarkavyLexiconReader method read.
/**
 * Reads a tab-separated lexicon file and builds a lexicon from it.
 *
 * <p>Lines starting with {@code #} are ignored. Every other line must hold at least five
 * tab-separated fields, which are passed in order to the {@code HarkavyLexicalFormEntry}
 * constructor; the fifth field is the one the {@code ,@poss} / {@code ,@acc,@dat} markers are
 * appended to. For each line the base form is recorded, plus a possessive and an
 * accusative/dative variant derived from it, unless the derived form is listed in
 * {@code exceptions}. All variants are written to the variant writer, if one is set, and every
 * collected word is put in the returned lexicon with frequency 1.
 *
 * @param file the tab-separated file to read
 * @return a lexicon containing every collected word with frequency 1
 * @throws IOException if the file cannot be opened or the variant writer fails
 * @throws ArrayIndexOutOfBoundsException if a non-comment line has fewer than five fields
 */
public TextFileLexicon read(File file) throws IOException {
  try (Scanner scanner = new Scanner(file)) {
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine();
      if (line.startsWith("#")) {
        continue;
      }

      // A negative limit keeps trailing empty fields. Plain split("\t") silently drops them,
      // so a record whose last column is empty would yield fewer than five parts and fail
      // on parts[4] even though the line is well-formed.
      String[] parts = line.split("\t", -1);
      HarkavyLexicalFormEntry entry = new HarkavyLexicalFormEntry(parts[0], parts[1], parts[2], parts[3], parts[4]);
      allVariants.add(entry);
      entries.add(entry.text);

      String radical = YiddishTextUtils.removeEndForm(entry.text);

      // Possessive: a longer suffix is used after the listed final consonants.
      String possessiveForm = radical;
      if (radical.endsWith("ס") || radical.endsWith("ש") || radical.endsWith("צ") || radical.endsWith("ת")) {
        possessiveForm += "עס";
      } else {
        possessiveForm += "ס";
      }
      if (!exceptions.contains(possessiveForm)) {
        HarkavyLexicalFormEntry possessiveEntry = new HarkavyLexicalFormEntry(possessiveForm, parts[1], parts[2], parts[3], parts[4] + ",@poss");
        allVariants.add(possessiveEntry);
        entries.add(possessiveEntry.text);
      }

      // Accusative/dative: a longer suffix is used after the listed final consonants.
      String accusativeForm = radical;
      if (radical.endsWith("ל") || radical.endsWith("מ") || radical.endsWith("נ")) {
        accusativeForm += "ען";
      } else {
        accusativeForm += "ן";
      }
      if (!exceptions.contains(accusativeForm)) {
        HarkavyLexicalFormEntry accusativeEntry = new HarkavyLexicalFormEntry(accusativeForm, parts[1], parts[2], parts[3], parts[4] + ",@acc,@dat");
        allVariants.add(accusativeEntry);
        entries.add(accusativeEntry.text);
      }
    }

    if (this.variantWriter != null) {
      for (HarkavyLexicalFormEntry variant : allVariants) {
        variantWriter.write(variant.toString() + "\n");
        variantWriter.flush();
      }
    }

    TextFileLexicon lexicon = new TextFileLexicon();
    for (String word : entries) {
      lexicon.setEntry(word, 1);
    }
    return lexicon;
  }
}
use of com.joliciel.jochre.lexicon.TextFileLexicon in project jochre by urieli.
the class NameListReader method main.
/**
 * Command-line entry point.
 *
 * <p>Two commands are recognised, selected by {@code args[0]}:
 * <ul>
 * <li>{@code load <inputFile> <variantFile> <lexiconFile> <defaultAttribute>}: reads
 * {@code inputFile} with {@code defaultAttribute} set on the reader, writes the variants to
 * {@code variantFile} (any existing file is deleted first) and serializes the resulting
 * lexicon to {@code lexiconFile}. All four arguments are required.</li>
 * <li>{@code deserialise <lexiconFile>}: deserializes the lexicon and logs the frequency it
 * reports for two fixed sample words.</li>
 * </ul>
 * The total runtime is logged at debug level, whether or not the command succeeds.
 *
 * @param args the command followed by its arguments, as described above
 * @throws Exception if reading, writing or (de)serialization fails
 * @throws IllegalArgumentException if {@code load} is given fewer than four arguments
 * @throws RuntimeException if the command is not recognised
 */
public static void main(String[] args) throws Exception {
  long startTime = System.currentTimeMillis();
  try {
    String command = args[0];
    if (command.equals("load")) {
      // args[4] is always read, so all five positions are mandatory. Check this before
      // touching the filesystem: otherwise the existing variant file would be deleted and a
      // writer opened (and leaked) just before failing on the missing argument.
      if (args.length < 5) {
        throw new IllegalArgumentException(
            "Usage: load <inputFile> <variantFile> <lexiconFile> <defaultAttribute>");
      }
      NameListReader reader = new NameListReader();
      File file = new File(args[1]);
      File variantFile = new File(args[2]);
      // Remove any previous output so the append-mode stream starts from an empty file.
      variantFile.delete();
      try (Writer variantWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(variantFile, true), "UTF8"))) {
        reader.setVariantWriter(variantWriter);
        reader.defaultAttribute = args[4];
        TextFileLexicon lexicon = reader.read(file);
        File lexiconFile = new File(args[3]);
        lexicon.serialize(lexiconFile);
      }
    } else if (command.equals("deserialise")) {
      File memoryBaseFile = new File(args[1]);
      Lexicon lexicon = TextFileLexicon.deserialize(memoryBaseFile);
      String[] words = new String[] { "חײמס", "חױמס" };
      for (String word : words) {
        LOG.debug("Have entry " + word + ": " + lexicon.getFrequency(word));
      }
    } else {
      throw new RuntimeException("Unknown command: " + command);
    }
  } finally {
    long elapsedMillis = System.currentTimeMillis() - startTime;
    LOG.debug("Total runtime: " + ((double) elapsedMillis / 1000) + " seconds");
  }
}
Aggregations