use of java.util.regex.Pattern in project CoreNLP by stanfordnlp.
the class Units method loadUnits.
/**
 * Loads unit definitions from a comma-separated file.
 *
 * <p>The first line of the file is a header; the columns named {@code unit}, {@code prefix},
 * {@code symbol}, {@code type}, {@code system}, {@code defaultUnit} and
 * {@code defaultUnitScale} are located by name, so their order in the file does not matter.
 * Each following line describes one unit. Fields are separated by commas with optional
 * surrounding whitespace.
 *
 * <p>Default units are resolved by name only after every line has been read, so a unit may
 * refer to a default unit that is defined later in the file. A default unit name that matches
 * no unit in the file is reported as a warning and left unset.
 *
 * @param filename name of the file to read (handed to {@code IOUtils.getBufferedFileReader})
 * @return the units in the order they appear in the file
 * @throws IOException if the file cannot be read or does not contain a header line
 */
public static List<Unit> loadUnits(String filename) throws IOException {
  Pattern commaPattern = Pattern.compile("\\s*,\\s*");
  List<Unit> list = new ArrayList<>();
  Map<String, Unit> unitsByName = new HashMap<>();
  // unit name -> (name of its default unit, scale); resolved after the whole file is read
  Map<String, Pair<String, Double>> unitToDefaultUnits = new HashMap<>();

  // try-with-resources closes the reader even when a line fails to parse
  try (BufferedReader br = IOUtils.getBufferedFileReader(filename)) {
    String headerString = br.readLine();
    if (headerString == null) {
      throw new IOException("Units file " + filename + " is empty: expected a header line");
    }
    String[] header = commaPattern.split(headerString);
    Map<String, Integer> headerIndex = new HashMap<>();
    for (int i = 0; i < header.length; i++) {
      headerIndex.put(header[i], i);
    }
    int iName = headerIndex.get("unit");
    int iPrefix = headerIndex.get("prefix");
    int iSymbol = headerIndex.get("symbol");
    int iType = headerIndex.get("type");
    int iSystem = headerIndex.get("system");
    int iDefaultUnit = headerIndex.get("defaultUnit");
    int iDefaultUnitScale = headerIndex.get("defaultUnitScale");

    String line;
    while ((line = br.readLine()) != null) {
      String[] fields = commaPattern.split(line);
      Unit unit = new Unit(fields[iName], fields[iSymbol], fields[iType].toUpperCase());
      unit.system = fields[iSystem];
      // Pattern.split drops trailing empty strings, so a row may have fewer fields than the
      // header; the length checks below treat such missing columns as "not specified".
      if (fields.length > iPrefix) {
        unit.prefixSystem = fields[iPrefix];
      }
      if (fields.length > iDefaultUnit) {
        double scale = 1.0;
        if (fields.length > iDefaultUnitScale) {
          scale = Double.parseDouble(fields[iDefaultUnitScale]);
        }
        unitToDefaultUnits.put(unit.getName(), Pair.makePair(fields[iDefaultUnit], scale));
      }
      unitsByName.put(unit.getName(), unit);
      list.add(unit);
    }
  }

  // Second pass: link each unit to its default unit now that all names are known.
  for (Map.Entry<String, Pair<String, Double>> entry : unitToDefaultUnits.entrySet()) {
    Unit unit = unitsByName.get(entry.getKey());
    Unit defaultUnit = unitsByName.get(entry.getValue().first);
    if (defaultUnit != null) {
      unit.defaultUnit = defaultUnit;
      unit.defaultUnitScale = entry.getValue().second;
    } else {
      Redwood.Util.warn("Unknown default unit " + entry.getValue().first + " for " + entry.getKey());
    }
  }
  return list;
}
use of java.util.regex.Pattern in project CoreNLP by stanfordnlp.
the class CMMClassifier method getThresholds.
/**
 * Reads a list of (pattern, threshold) pairs from the given source.
 *
 * <p>Each line is split at its last space character: the text before it is compiled as a
 * regular expression and the text after it is parsed as an integer.
 *
 * @param filename source description handed to {@code IOUtils.readerFromString}
 * @return the pairs in the order they were read
 * @throws RuntimeIOException if opening, reading or closing the source fails
 */
private static List<Pair<Pattern, Integer>> getThresholds(String filename) {
  try (BufferedReader reader = IOUtils.readerFromString(filename)) {
    List<Pair<Pattern, Integer>> result = new ArrayList<>();
    String entry = reader.readLine();
    while (entry != null) {
      int lastSpace = entry.lastIndexOf(' ');
      Pattern regex = Pattern.compile(entry.substring(0, lastSpace));
      Integer threshold = Integer.valueOf(entry.substring(lastSpace + 1));
      result.add(new Pair<>(regex, threshold));
      entry = reader.readLine();
    }
    return result;
  } catch (IOException e) {
    throw new RuntimeIOException("Error reading threshold file", e);
  }
}
use of java.util.regex.Pattern in project CoreNLP by stanfordnlp.
the class MultiWordStringMatcher method getPattern.
/**
 * Returns the pattern for the given target string, building it on first use.
 *
 * <p>The pattern is looked up in {@code targetStringPatternCache}; when no entry is found it is
 * built with {@code createPattern} and stored in the cache before being returned.
 *
 * @param targetString the string to obtain a pattern for
 * @return the cached or newly created pattern
 */
public Pattern getPattern(String targetString) {
  Pattern cached = targetStringPatternCache.get(targetString);
  if (cached != null) {
    return cached;
  }
  Pattern created = createPattern(targetString);
  targetStringPatternCache.put(targetString, created);
  return created;
}
use of java.util.regex.Pattern in project CoreNLP by stanfordnlp.
the class TreeToTSV method main.
/**
 * Converts a tree file into tab-separated "word, entity label" lines on standard output.
 *
 * <p>For every pre-terminal node of every tree, one line {@code word<TAB>label} is written.
 * The label is derived from a single type character taken either from the label of the node
 * returned by {@code t.ancestor(1, tree)} when it starts with {@code grup.nom.} (the character
 * right after that prefix), or otherwise from the pre-terminal's own label when it has the
 * form {@code np0000x} (the final character). The type character maps to {@code PERS} (p),
 * {@code LUG} (l), {@code ORG} (o) or {@code OTROS} (0); anything else yields {@code O}.
 * A blank line is written after each tree.
 *
 * @param args {@code args[0]} is the path of the UTF-8 encoded tree file
 */
public static void main(String[] args) {
  if (args.length < 1) {
    System.err.printf("Usage: java %s tree_file%n", TreeToTSV.class.getName());
    System.exit(-1);
  }
  String treeFile = args[0];
  // try-with-resources guarantees the file is closed even if reading a tree fails part-way
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))) {
    TreeReaderFactory trf = new SpanishTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);
    StringBuilder sb = new StringBuilder();
    String nl = System.getProperty("line.separator");
    Pattern nePattern = Pattern.compile("^grup\\.nom\\.");
    Pattern npPattern = Pattern.compile("^np0000.$");
    for (Tree tree; (tree = tr.readTree()) != null; ) {
      for (Tree t : tree) {
        if (!t.isPreTerminal()) {
          continue;
        }
        char type = 'O';
        Tree grandma = t.ancestor(1, tree);
        String grandmaValue = ((CoreLabel) grandma.label()).value();
        if (nePattern.matcher(grandmaValue).find()) {
          // grup.nom.x -- the prefix is nine characters long, so index 9 holds the type.
          // The pattern does not require a character after the prefix; without the length
          // check a bare "grup.nom." label would throw StringIndexOutOfBoundsException.
          if (grandmaValue.length() > 9) {
            type = grandmaValue.charAt(9);
          }
        } else {
          // else check the pos for np0000x or not; the pattern fixes the length at seven,
          // so index 6 is always present here
          String pos = ((CoreLabel) t.label()).value();
          if (npPattern.matcher(pos).find()) {
            type = pos.charAt(6);
          }
        }
        Tree wordNode = t.firstChild();
        String word = ((CoreLabel) wordNode.label()).value();
        sb.append(word).append("\t");
        switch (type) {
          case 'p':
            sb.append("PERS");
            break;
          case 'l':
            sb.append("LUG");
            break;
          case 'o':
            sb.append("ORG");
            break;
          case '0':
            sb.append("OTROS");
            break;
          default:
            sb.append("O");
        }
        sb.append(nl);
      }
      // blank line separates consecutive trees
      sb.append(nl);
    }
    System.out.print(sb.toString());
    tr.close();
  } catch (IOException e) {
    // UnsupportedEncodingException and FileNotFoundException are both IOExceptions and were
    // handled identically, so a single handler covers all three cases.
    e.printStackTrace();
  }
}
use of java.util.regex.Pattern in project CoreNLP by stanfordnlp.
the class ChineseSimWordAvgDepGrammar method getMap.
/**
 * Reads a similar-word map from a UTF-8 encoded file.
 *
 * <p>Each well-formed line has the shape {@code sim(a/b:c/d)=score}. The pair
 * ({@code wordIndex} id of {@code a}, {@code b}) becomes a key of the result, and the triple
 * ({@code wordIndex} id of {@code c}, {@code d}, score) is appended to the list stored under
 * that key. Lines that do not match this shape are logged and skipped. Note that {@code a}
 * and {@code c} are added to {@code wordIndex} as a side effect.
 * (Presumably {@code a}/{@code c} are words and {@code b}/{@code d} their tags -- confirm
 * against the file format.)
 *
 * @param filename path of the similar-word map file
 * @return map from each (index, string) key to its list of (index, string, score) entries
 * @throws RuntimeException if the file cannot be opened or read; the underlying
 *     {@link IOException} is attached as the cause
 */
public Map<Pair<Integer, String>, List<Triple<Integer, String, Double>>> getMap(String filename) {
  Map<Pair<Integer, String>, List<Triple<Integer, String, Double>>> hashMap = Generics.newHashMap();
  Pattern linePattern = Pattern.compile("sim\\((.+)/(.+):(.+)/(.+)\\)=(.+)");
  // try-with-resources: the reader is closed on both the success and the failure path
  try (BufferedReader wordMapBReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
    String wordMapLine;
    while ((wordMapLine = wordMapBReader.readLine()) != null) {
      Matcher m = linePattern.matcher(wordMapLine);
      if (!m.matches()) {
        log.info("Ill-formed line in similar word map file: " + wordMapLine);
        continue;
      }
      Pair<Integer, String> iTW = new Pair<>(wordIndex.addToIndex(m.group(1)), m.group(2));
      double score = Double.parseDouble(m.group(5));
      List<Triple<Integer, String, Double>> tripleList = hashMap.get(iTW);
      if (tripleList == null) {
        tripleList = new ArrayList<>();
        hashMap.put(iTW, tripleList);
      }
      tripleList.add(new Triple<>(wordIndex.addToIndex(m.group(3)), m.group(4), score));
    }
  } catch (IOException e) {
    // keep the original exception as the cause so the real I/O failure is not lost
    throw new RuntimeException("Problem reading similar words file!", e);
  }
  return hashMap;
}
Aggregations