Search in sources :

Example 21 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class SynonymsRecgnitionTest method test.

/**
 * Prints each term together with its synonyms: first for a plain parse, then
 * after synonym recognition against the default library, and again after an
 * insert, an append and a remove on that library.
 */
@Test
public void test() {
    // Use the default synonym dictionary
    SynonymsRecgnition synonymsRecgnition = new SynonymsRecgnition();
    String str = "我国中国就是华夏,也是天朝";

    // Plain parse, no synonym recognition applied
    printSynonyms(ToAnalysis.parse("我国中国就是华夏"));

    System.out.println("-------------init library------------------");
    printSynonyms(ToAnalysis.parse(str).recognition(synonymsRecgnition));

    System.out.println("---------------insert----------------");
    SynonymsLibrary.insert(SynonymsLibrary.DEFAULT, new String[] { "中国", "我国" });
    printSynonyms(ToAnalysis.parse(str).recognition(synonymsRecgnition));

    System.out.println("---------------append----------------");
    SynonymsLibrary.append(SynonymsLibrary.DEFAULT, new String[] { "中国", "华夏", "天朝" });
    printSynonyms(ToAnalysis.parse(str).recognition(synonymsRecgnition));

    System.out.println("---------------remove----------------");
    SynonymsLibrary.remove(SynonymsLibrary.DEFAULT, "我国");
    printSynonyms(ToAnalysis.parse(str).recognition(synonymsRecgnition));
}

/** Prints one line per term: the term name, a tab, then its synonyms. */
private void printSynonyms(Iterable<Term> terms) {
    for (Term t : terms) {
        System.out.println(t.getName() + "\t" + (t.getSynonyms()));
    }
}
Also used : Term(org.ansj.domain.Term) Test(org.junit.Test)

Example 22 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class DicAnalysisTest method test1.

/**
 * Exercises user-dictionary insertion and deletion through {@code DicAnalysis}.
 *
 * Inserts several words into the default dictionary, prints three sample
 * parses, then checks that a newly inserted word appears as a term carrying
 * the nature it was inserted with, and that it no longer appears once deleted.
 */
@Test
public void test1() {
    DicLibrary.insert(DicLibrary.DEFAULT, "金水区", "ad", 1000);
    DicLibrary.insert(DicLibrary.DEFAULT, "渝北区", "ad", 1000);
    DicLibrary.insert(DicLibrary.DEFAULT, "金童路", "ad", 1000);
    DicLibrary.insert(DicLibrary.DEFAULT, "奥山", "ad", 1000);
    DicLibrary.insert(DicLibrary.DEFAULT, "来自大", "ab", 1000);
    DicLibrary.insert(DicLibrary.DEFAULT, "自大学", "ab", 2000);
    DicLibrary.insert(DicLibrary.DEFAULT, "网大学", "ab", 1000);
    System.out.println(DicAnalysis.parse("重庆重庆市渝北区金童路奥山别墅162"));
    System.out.println(DicAnalysis.parse("河南省郑州市金水区金水区农科路与文博西路交叉口向东200米路南"));
    System.out.println(DicAnalysis.parse("来自大学生小说网大学"));
    String newWord = "爸爸去哪儿";
    String nature = "aaaaa";
    String str = "上海电力2012年财务报表如下怎爸爸去哪儿么办";
    // Add new words
    DicLibrary.insert(DicLibrary.DEFAULT, newWord, nature, 1000);
    DicLibrary.insert(DicLibrary.DEFAULT, "上海电力", nature, 1000);
    HashMap<String, Term> hs = indexByName(DicAnalysis.parse(str).getTerms());
    Assert.assertTrue(hs.containsKey(newWord));
    // JUnit's signature is assertEquals(expected, actual); keeping that order
    // makes a failure message report the right value as "expected".
    Assert.assertEquals(nature, hs.get(newWord).natrue().natureStr);
    Library.insertWord(DicLibrary.get(), new Value("北京卡", "UserDefined", "1000"));
    Assert.assertEquals("北京卡", DicAnalysis.parse("北京卡机场服务").get(0).getName());
    // Delete the word
    DicLibrary.delete(DicLibrary.DEFAULT, newWord);
    hs = indexByName(DicAnalysis.parse(str).getTerms());
    Assert.assertFalse(hs.containsKey(newWord));
}

/** Maps each term's name to the term; a later term with the same name replaces an earlier one. */
private static HashMap<String, Term> indexByName(List<Term> terms) {
    HashMap<String, Term> byName = new HashMap<String, Term>();
    for (Term term : terms) {
        byName.put(term.getName(), term);
    }
    return byName;
}
Also used : HashMap(java.util.HashMap) Value(org.nlpcn.commons.lang.tire.domain.Value) Term(org.ansj.domain.Term) CorpusTest(org.ansj.CorpusTest) Test(org.junit.Test)

Example 23 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class SpeedTest method main.

/**
 * Rough throughput measurement: streams a corpus file through
 * {@code ToAnalysis}, sums the lengths of the names of all returned terms,
 * and prints the elapsed milliseconds and characters processed per second.
 *
 * @param args unused
 * @throws IOException if the corpus file cannot be opened, read or closed
 */
public static void main(String[] args) throws IOException {
    // One parse before the clock starts (presumably to trigger one-time
    // initialisation outside the measured interval -- TODO confirm).
    ToAnalysis.parse("test---aaaa中国孙健测试");
    // Open a single reader and always close it; the original opened two
    // readers on the same file and closed neither.
    BufferedReader reader = IOUtil.getReader("/home/ansj/data/allSportsArticle", IOUtil.UTF8);
    try {
        long start = System.currentTimeMillis();
        long allCount = 0;
        ToAnalysis toAnalysis = new ToAnalysis(reader);
        Term term = null;
        while ((term = toAnalysis.next()) != null) {
            allCount += term.getName().length();
        }
        long end = System.currentTimeMillis();
        // Elapsed time is end - start; the reverse printed a negative number.
        long elapsedMillis = end - start;
        System.out.println(elapsedMillis);
        // Clamp to 1 ms so a very fast run cannot divide by zero.
        long divisor = Math.max(1L, elapsedMillis);
        System.out.println("共 " + allCount + " 个字符,每秒处理了:" + (allCount * 1000 / divisor));
    } finally {
        reader.close();
    }
}
Also used : BufferedReader(java.io.BufferedReader) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) Term(org.ansj.domain.Term)

Example 24 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class StopRecognition method recognition.

/**
 * Removes from the result's term list every term for which
 * {@code filter(term)} returns true. The list is modified in place.
 *
 * @param result the result whose terms are filtered
 */
@Override
public void recognition(Result result) {
    // Remove through the iterator so the list can be modified while it is traversed.
    for (Iterator<Term> it = result.getTerms().iterator(); it.hasNext(); ) {
        if (filter(it.next())) {
            it.remove();
        }
    }
}
Also used : Term(org.ansj.domain.Term)

Example 25 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class Analysis method analysisStr.

/**
 * Segments one whole sentence; user-configured ambiguity entries take priority.
 *
 * When an ambiguity forest is set, each match it yields is added to the graph
 * as explicit terms built from the match's (word, nature) parameter pairs,
 * and the unmatched text before it is passed to {@code analysis}. Any text
 * left after the last match is passed to {@code analysis} as well.
 *
 * @param temp the sentence to segment
 * @return the terms produced by {@code getResult} for the populated graph
 */
private List<Term> analysisStr(String temp) {
    Graph gp = new Graph(temp);
    // Offset up to which the sentence has already been handled.
    int cursor = 0;
    if (this.ambiguityForest != null) {
        GetWord matcher = new GetWord(this.ambiguityForest, gp.chars);
        while (matcher.getFrontWords() != null) {
            // Analyse the gap between the previous position and this match.
            if (matcher.offe > cursor) {
                analysis(gp, cursor, matcher.offe);
            }
            String[] pairs = matcher.getParams();
            cursor = matcher.offe;
            // pairs holds alternating entries: word at k, nature string at k + 1.
            for (int k = 0; k < pairs.length; k += 2) {
                TermNatures natures = new TermNatures(new TermNature(pairs[k + 1], 1));
                gp.addTerm(new Term(pairs[k], cursor, natures));
                cursor += pairs[k].length();
            }
        }
    }
    // Analyse whatever remains after the last ambiguity match.
    if (cursor < gp.chars.length) {
        analysis(gp, cursor, gp.chars.length);
    }
    return this.getResult(gp);
}
Also used : Graph(org.ansj.util.Graph) TermNatures(org.ansj.domain.TermNatures) Term(org.ansj.domain.Term) TermNature(org.ansj.domain.TermNature) GetWord(org.nlpcn.commons.lang.tire.GetWord)

Aggregations

Term (org.ansj.domain.Term)55 ArrayList (java.util.ArrayList)10 Result (org.ansj.domain.Result)8 Test (org.junit.Test)8 TermNatures (org.ansj.domain.TermNatures)5 AsianPersonRecognition (org.ansj.recognition.arrimpl.AsianPersonRecognition)4 ForeignPersonRecognition (org.ansj.recognition.arrimpl.ForeignPersonRecognition)4 NumRecognition (org.ansj.recognition.arrimpl.NumRecognition)4 Graph (org.ansj.util.Graph)4 Forest (org.nlpcn.commons.lang.tire.domain.Forest)4 LinkedList (java.util.LinkedList)3 NewWord (org.ansj.domain.NewWord)3 UserDefineRecognition (org.ansj.recognition.arrimpl.UserDefineRecognition)3 NatureRecognition (org.ansj.recognition.impl.NatureRecognition)3 GetWord (org.nlpcn.commons.lang.tire.GetWord)3 BufferedReader (java.io.BufferedReader)2 HashMap (java.util.HashMap)2 TermNature (org.ansj.domain.TermNature)2 ToAnalysis (org.ansj.splitWord.analysis.ToAnalysis)2 Analyzer (org.apache.lucene.analysis.Analyzer)2