Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
Class ChineseTokenizer, method main.
/**
 * Ad-hoc driver: builds a ChineseTokenizer from a hard-coded directory, tokenizes one sample
 * string with getTextAnnotation1 and prints every resulting token on its own line.
 *
 * NOTE(review): the closing brace of this method is not present in this excerpt.
 *
 * @param args command-line arguments; not read by the visible code
 */
public static void main(String[] args) {
// Initial sample text. It is overwritten twice below, so only the last assignment is tokenized.
String line = " 面对新世纪, 世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果。克服20世纪困扰着人类的战争和贫困问题,推进和平与发展的崇高事业,创造一个美好的世界。";
// Alternative sample input, currently disabled.
// line = "2006年大西洋颶風季時間軸中記錄有全年大西洋盆地所有熱帶和亞熱帶氣旋形成、增強、減弱、登陸、轉變成溫帶氣旋以及消散的具體信息。2006年大西洋颶風季於2006年6月1日正式開始,同年11月30日結束,傳統上這樣的日期界定了一年中絕大多數熱帶氣旋在大西洋形成的時間段,這一颶風季是繼2001年大西洋颶風季以來第一個沒有任何一場颶風在美國登陸的大西洋颶風季,也是繼1994年大西洋颶風季以來第一次在整個十月份都沒有熱帶氣旋形成。美國國家颶風中心每年都會對前一年颶風季的所有天氣系統進行重新分析,並根據結果更新其風暴資料庫,因此時間軸中還包括實際操作中沒有發布的信息。包括最大持續風速、位置、距離在內的所有數字都是經四捨五入換算成整數。";
line = "巴拉克 歐巴馬";
// Last assignment wins: this is the string actually passed to the tokenizer.
line = "ab-cde";
// Alternative sample input mixing Chinese and Latin text, currently disabled.
// line = "在古巴的美国代表机构是由哈瓦那的United States Interests Section(美国利益科)代理,在美国首都华盛顿有一个类似的Cuban Interests Section(古巴利益科),其则是瑞士大使馆的组成部分。";
// Hard-coded, machine-specific directory handed to the ChineseTokenizer constructor.
// NOTE(review): presumably the Stanford segmenter data directory (judging by the path) — confirm.
String basedir = "/shared/experiments/ctsai12/workspace/stanford-segmenter-2015-04-20/data/";
ChineseTokenizer ct = new ChineseTokenizer(basedir);
// NOTE(review): getTextAnnotation1 returns null in some cases (see its early returns);
// the result is dereferenced on the next line without a null check.
TextAnnotation ta = ct.getTextAnnotation1(line);
for (String t : ta.getTokens()) System.out.println(t);
// Disabled debugging code: looks up the token covering character offset 5 and prints its span.
// int tid = ta.getTokenIdFromCharacterOffset(5);
// System.out.println("token id "+tid);
// System.out.println("token: "+ta.getToken(tid));
// IntPair offs = ta.getTokenCharacterOffset(tid);
// System.out.println("start: "+offs.getFirst());
// System.out.println("edn: "+offs.getSecond());
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
Class ChineseTokenizer, method getTextAnnotation1.
/**
 * Builds a TextAnnotation for the given text from token character offsets, token surface
 * strings and sentence-end token indices.
 *
 * The text is split into lines on "\n" and each line into sentences on "。". Every sentence is
 * passed to segmenter.segmentString, and each returned segment is located in the text with
 * indexOf to obtain its character offsets.
 *
 * NOTE(review): this excerpt appears to have lost lines. Closing braces are absent, three
 * `if` guards have no guarded statement, and nothing visible adds to `surfaces` or `sen_ends`.
 * Confirm all details against the original source.
 *
 * @param text the raw text to tokenize
 * @return the constructed TextAnnotation, or null when the text is blank after trimming or no
 *         surface strings were collected
 */
public TextAnnotation getTextAnnotation1(String text) {
if (text.trim().isEmpty())
return null;
// NOTE(review): presumably converts traditional to simplified characters (inferred from the
// helper's name only) — confirm. All offsets below refer to this converted text.
text = trad2simp(text);
List<IntPair> offsets = new ArrayList<>();
List<String> surfaces = new ArrayList<>();
List<Integer> sen_ends = new ArrayList<>();
String[] lines = text.split("\n");
// Search cursor into `text`; segments are looked up from here onward so that repeated
// strings resolve to successive occurrences.
int idx = 0;
for (String line : lines) {
// NOTE(review): the statement guarded by this `if` is missing from the excerpt
// (likely a `continue;` that skips blank lines) — confirm.
if (line.trim().isEmpty())
String[] sentences = line.split("。");
for (int i = 0; i < sentences.length; i++) {
String sentence = sentences[i];
// NOTE(review): the guarded statement is missing here as well (likely `continue;`) — confirm.
if (sentence.trim().isEmpty())
List<String> segs = segmenter.segmentString(sentence);
for (String seg : segs) {
// NOTE(review): indexOf returns -1 if the segment is not found verbatim in `text`;
// the visible code does not check for that.
idx = text.indexOf(seg, idx);
if (!containsHanScript(seg)) {
// Segment for which containsHanScript is false: kept as a single token span.
offsets.add(new IntPair(idx, idx + seg.length()));
} else {
// Segment for which containsHanScript is true: one token span per character.
for (int j = 0; j < seg.length(); j++) {
// NOTE(review): `ch` is unused in the visible lines; a `surfaces.add(ch)` may have been dropped.
String ch = seg.substring(j, j + 1);
offsets.add(new IntPair(idx + j, idx + j + 1));
// Advance the cursor past the segment just processed.
idx += seg.length();
// split("。") removed the delimiter, so for every sentence but the last of the line the
// "。" is located again in the text and given its own one-character span.
if (i < sentences.length - 1) {
idx = text.indexOf("。", idx);
offsets.add(new IntPair(idx, ++idx));
// NOTE(review): the guarded statement is missing; presumably it appends surfaces.size() as a
// final sentence end when the last recorded end does not already equal it — confirm.
if (sen_ends.size() == 0 || sen_ends.get(sen_ends.size() - 1) != surfaces.size())
// Disabled debugging output of the collected surfaces, offsets and sentence ends.
// for(int i = 0; i < surfaces.size(); i++){
// System.out.println(i+" "+surfaces.get(i)+" "+offsets.get(i));
// }
// System.out.println(sen_ends);
// System.out.println(surfaces.size()+" "+offsets.size()+" "+sen_ends.size());
// Convert the collected lists into the arrays passed to the TextAnnotation constructor.
IntPair[] offs = new IntPair[offsets.size()];
offs = offsets.toArray(offs);
String[] surfs = new String[surfaces.size()];
surfs = surfaces.toArray(surfs);
int[] ends = new int[sen_ends.size()];
for (int i = 0; i < sen_ends.size(); i++) ends[i] = sen_ends.get(i);
// No tokens collected: return null instead of constructing an annotation.
if (surfs.length == 0)
return null;
// The first two constructor arguments are passed as empty strings.
TextAnnotation ta = new TextAnnotation("", "", text, offs, surfs, ends);
return ta;
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
Class ChineseTokenizer, method oldGetTextAnnotation.
/**
 * Builds a TextAnnotation for the given text from token character offsets, token surface
 * strings and sentence-end token indices.
 *
 * The text is split on "。", each piece is passed to segmenter.segmentString, and each returned
 * segment is located in the text with indexOf. A segment longer than one character that ends
 * in "人" is split into two tokens: everything before the final character, and "人" itself.
 *
 * NOTE(review): this excerpt appears to have lost lines. Closing braces are absent, the `else`
 * branch never adds to `surfaces`, and nothing visible adds to `sen_ends`. Confirm against the
 * original source.
 *
 * @param text the raw text to tokenize
 * @return the constructed TextAnnotation, or null when the text is blank after trimming or no
 *         surface strings were collected
 */
public TextAnnotation oldGetTextAnnotation(String text) {
if (text.trim().isEmpty())
return null;
// The trad2simp conversion is disabled in this method.
// text = trad2simp(text);
List<IntPair> offsets = new ArrayList<>();
List<String> surfaces = new ArrayList<>();
List<Integer> sen_ends = new ArrayList<>();
// Despite the name, `lines` holds the pieces between "。" delimiters, not newline-separated lines.
String[] lines = text.split("。");
// Search cursor into `text`, advanced past each segment once it has been located.
int idx = 0;
for (int i = 0; i < lines.length; i++) {
String line = lines[i];
List<String> segs = segmenter.segmentString(line);
for (String seg : segs) {
if (seg.length() > 1 && seg.endsWith("人")) {
// First token: the segment without its trailing "人".
surfaces.add(seg.substring(0, seg.length() - 1));
// NOTE(review): indexOf returns -1 if the segment is not found verbatim in `text`;
// the visible code does not check for that.
idx = text.indexOf(seg, idx);
offsets.add(new IntPair(idx, idx + seg.length() - 1));
// Second token: the trailing "人", covering the last character of the segment.
surfaces.add(seg.substring(seg.length() - 1, seg.length()));
offsets.add(new IntPair(idx + seg.length() - 1, idx + seg.length()));
idx += seg.length();
} else {
// Ordinary segment: a single token span covering the whole segment.
// NOTE(review): no matching surfaces.add(...) is visible in this branch — likely dropped from the excerpt.
idx = text.indexOf(seg, idx);
offsets.add(new IntPair(idx, idx + seg.length()));
idx += seg.length();
// split("。") removed the delimiter, so for every piece but the last the "。" is located
// again in the text and given its own one-character span.
if (i < lines.length - 1) {
idx = text.indexOf("。", idx);
offsets.add(new IntPair(idx, ++idx));
// Convert the collected lists into the arrays passed to the TextAnnotation constructor.
IntPair[] offs = new IntPair[offsets.size()];
offs = offsets.toArray(offs);
String[] surfs = new String[surfaces.size()];
surfs = surfaces.toArray(surfs);
int[] ends = new int[sen_ends.size()];
for (int i = 0; i < sen_ends.size(); i++) ends[i] = sen_ends.get(i);
// No tokens collected: return null instead of constructing an annotation.
if (surfs.length == 0)
return null;
// The first two constructor arguments are passed as empty strings.
TextAnnotation ta = new TextAnnotation("", "", text, offs, surfs, ends);
return ta;
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
Class MultilingualTokenizeTextToColumn, method processFile.
/**
 * Given an input file containing plain text, tokenize it and write the result to the named output file.
 *
 * @param corpus name of corpus
 * @param in file to tokenize
 * @param out output file for tokenized text
 */
// Reads the input file, builds a TextAnnotation from its contents via taBldr, walks the tokens
// of each sentence, and writes to the file named by `out`.
// Throws IOException when `in` does not exist or is not a regular file, and rethrows (after
// logging) any IOException raised while writing.
// NOTE(review): this excerpt appears to have lost lines and parts of lines: closing braces,
// the loop bodies, and pieces of the statements flagged below. Confirm against the original.
public void processFile(String corpus, File in, String out) throws IOException {
// Fail fast with a descriptive message when the input is missing or is not a regular file.
if (!in.exists())
throw new IOException("File '" + in.getAbsolutePath() + "' doesn't exist.");
if (!in.isFile())
throw new IOException("File '" + in.getAbsolutePath() + "' exists but is not a file.");
// NOTE(review): the second constructor argument (presumably a charset name) is missing here.
Scanner scanner = new Scanner(new FileInputStream(in),;
StringBuilder sb = new StringBuilder();
// NOTE(review): the loop body that appends each line to `sb`, and any scanner.close(), are not
// visible in this excerpt.
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
String str = sb.toString();
// The corpus name and the input file's name are passed along with the text.
TextAnnotation ta = taBldr.createTextAnnotation(corpus, in.getName(), str);
// NOTE(review): the text after the first `;` looks like the argument list of a dropped logging
// call (its receiver and method name are missing) — confirm.
View sents = ta.getView(ViewNames.SENTENCE);"processing file '{}'; input length is {}", in.getAbsolutePath(), str.length());
// System.err.println("processing file '" + in.getAbsolutePath() + "'..." + " input length: " + str.length());
List<Constituent> toks = ta.getView(ViewNames.TOKENS).getConstituents();
// List<String> outputs = new ArrayList<>();
// Accumulates the output text; its length is reported on stderr below.
StringBuilder bldr = new StringBuilder();
for (Constituent sent : sents) {
// Counter that starts at 1 for every sentence.
// NOTE(review): presumably the per-sentence token number written to the output — confirm.
int index = 1;
for (Constituent tok : toks) {
// Select tokens whose character span lies entirely inside the current sentence's span.
// Every token is tested against every sentence.
if (tok.getStartCharOffset() >= sent.getStartCharOffset() && tok.getEndCharOffset() <= sent.getEndCharOffset()) {
// empty line to separate sentences
System.err.println("output length: " + bldr.toString().length());
// LineIO.write(out, outputs);
// NOTE(review): the remaining OutputStreamWriter argument (presumably a charset) and the body
// of the try block are missing from this excerpt.
try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(out)), {
} catch (IOException e) {
// Log the failure, then rethrow the same exception to the caller.
logger.error("Can't write to file {}: {}", out, e.getMessage());
throw e;
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
Class StanfordAnalyzer, method getTextAnnotation.
/**
 * Builds a TextAnnotation for the given text from the tokens and sentences of a Stanford
 * Annotation document. For each token it takes the original text and begin/end character
 * positions; for each sentence it records the running token count as the sentence end.
 *
 * NOTE(review): this excerpt appears to have lost lines. Closing braces are absent, no call
 * that runs an annotation pipeline over `document` is visible, and nothing visible adds to
 * `tokens`. Confirm against the original source.
 *
 * @param text the raw text to annotate
 * @return the constructed TextAnnotation
 */
public TextAnnotation getTextAnnotation(String text) {
Annotation document = new Annotation(text);
// Flat list meant to hold the tokens of all sentences in document order.
List<CoreLabel> tokens = new ArrayList<>();
// NOTE(review): presumably populated by an annotate call that is not visible here — confirm.
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
int[] sen_ends = new int[sentences.size()];
int sen_idx = 0;
for (CoreMap sentence : sentences) {
// NOTE(review): the inner loop body (presumably tokens.add(token)) is missing from the excerpt.
for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
// Records the current size of `tokens` as the next sentence end.
sen_ends[sen_idx++] = tokens.size();
// Copy each token's original text and character span into parallel arrays.
String[] surfaces = new String[tokens.size()];
IntPair[] tokenCharOffsets = new IntPair[tokens.size()];
for (int i = 0; i < tokens.size(); i++) {
surfaces[i] = tokens.get(i).originalText();
tokenCharOffsets[i] = new IntPair(tokens.get(i).beginPosition(), tokens.get(i).endPosition());
// System.out.println(surfaces[i]);
// System.out.println(tokenCharOffsets[i]);
// System.out.println(sen_ends[0]);
// The first two constructor arguments are passed as empty strings.
TextAnnotation ta = new TextAnnotation("", "", text, tokenCharOffsets, surfaces, sen_ends);
return ta;