Search in sources :

Example 26 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class Analysis method analysis.

/**
 * Walks the characters of {@code gp} in the range {@code [startOffe, endOffe)} and adds the
 * terms found there to the graph.
 * <p>
 * The range is consumed as consecutive runs, classified through {@code status(char)}:
 * <ul>
 * <li>a run of status 4 characters is converted with {@code WordAlert.alertEnglish} and added
 * as a single {@code TermNatures.EN} term;</li>
 * <li>a run of status 5 characters is converted with {@code WordAlert.alertNumber} and added
 * as a single {@code TermNatures.M} term;</li>
 * <li>any other run (its first character plus all following characters whose status is below 4)
 * is handed to {@code gwi}; every word it yields is added as a term, and the positions skipped
 * between those words are added as single-character {@code TermNatures.NULL} terms.</li>
 * </ul>
 *
 * @param gp        graph supplying the character array and receiving the terms
 * @param startOffe index of the first character to process (inclusive)
 * @param endOffe   index at which processing stops (exclusive)
 */
private void analysis(Graph gp, int startOffe, int endOffe) {
    final char[] chars = gp.chars;
    int i = startOffe;
    while (i < endOffe) {
        final int begin = i;
        final int kind = status(chars[begin]);

        if (kind == 4 || kind == 5) {
            // Measure the run of characters that share the status of the first one.
            int runLength = 1;
            while (++i < endOffe && status(chars[i]) == kind) {
                runLength++;
            }
            if (kind == 4) {
                String english = WordAlert.alertEnglish(chars, begin, runLength);
                gp.addTerm(new Term(english, begin, TermNatures.EN));
            } else {
                String number = WordAlert.alertNumber(chars, begin, runLength);
                gp.addTerm(new Term(number, begin, TermNatures.M));
            }
            // i already points at the first character after the run.
            continue;
        }

        // The first character always belongs to this run; extend it until a character
        // with status 4 or higher, or the end of the range, is reached.
        int stop = begin + 1;
        while (stop < endOffe && status(chars[stop]) < 4) {
            stop++;
        }
        i = stop;

        gwi.setChars(chars, begin, stop);
        // First position of the run that has not been turned into a term yet.
        int cursor = begin;
        String word;
        while ((word = gwi.allWords()) != null) {
            Term found = new Term(word, gwi.offe, gwi.getItem());
            // Fill the gap in front of the word with single-character terms.
            while (cursor < found.getOffe()) {
                gp.addTerm(new Term(String.valueOf(chars[cursor]), cursor, TermNatures.NULL));
                cursor++;
            }
            gp.addTerm(found);
            // NOTE(review): toValue() is used as the position to resume from, presumably the
            // end offset of the term - confirm against Term.
            cursor = found.toValue();
        }
        // Fill whatever is left of the run after the last word.
        while (cursor < stop) {
            gp.addTerm(new Term(String.valueOf(chars[cursor]), cursor, TermNatures.NULL));
            cursor++;
        }
    }
}
Also used : Term(org.ansj.domain.Term)

Example 27 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class Analysis method parse.

/**
 * Obtains the complete segmentation result from the reader that was passed in through the
 * constructor.
 * <p>
 * Calls {@code next()} repeatedly until it returns {@code null} and wraps the collected terms,
 * in the order they were returned, in a {@link Result}.
 *
 * @return a result holding every term produced by {@code next()}
 * @throws IOException declared by this method; propagated from {@code next()}
 */
public Result parse() throws IOException {
    List<Term> terms = new ArrayList<Term>();
    for (Term current = next(); current != null; current = next()) {
        terms.add(current);
    }
    return new Result(terms);
}
Also used : ArrayList(java.util.ArrayList) Term(org.ansj.domain.Term) Result(org.ansj.domain.Result)

Example 28 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class Graph method merger.

/**
 * Performs the actual traversal-and-scoring step for one target position.
 * <p>
 * When {@code terms[to]} already holds a term, every term in the chain reachable from it through
 * {@code next()} has its path score set against {@code fromTerm}. When the slot is empty, a
 * single-character term is created from {@code chars[to]}, stored in {@code terms[to]} and
 * scored the same way.
 *
 * @param fromTerm    term passed as the source to {@code setPathScore}
 * @param to          index into {@code terms} and {@code chars} of the position being scored
 * @param relationMap map passed through unchanged to {@code setPathScore}
 */
private void merger(Term fromTerm, int to, Map<String, Double> relationMap) {
    Term head = terms[to];
    if (head == null) {
        // Nothing was recognised at this position: fall back to a single-character term.
        char ch = chars[to];
        TermNatures natures = DATDictionary.getItem(ch).termNatures;
        if (natures == null) {
            natures = TermNatures.NULL;
        }
        Term single = new Term(String.valueOf(ch), to, natures);
        terms[to] = single;
        single.setPathScore(fromTerm, relationMap);
        return;
    }
    // Relation: to.set(from), applied to every term chained at this position.
    for (Term candidate = head; candidate != null; candidate = candidate.next()) {
        candidate.setPathScore(fromTerm, relationMap);
    }
}
Also used : TermNatures(org.ansj.domain.TermNatures) Term(org.ansj.domain.Term)

Example 29 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class AnsjTokenizer method incrementToken.

/**
 * Emits the next token from the pending result queue.
 * <p>
 * The queue is filled through {@code parse()} when it is {@code null}. A queued {@code Term} is
 * emitted after skipping every term rejected by {@code filterTerm}; when the term has synonyms,
 * the first one becomes the token text and the remaining ones are pushed back onto the front of
 * the queue. Any other queued object is emitted using its {@code toString()} value.
 *
 * @return {@code true} when a token was produced, {@code false} when the queue is exhausted
 *         (in which case {@code result} is reset to {@code null})
 * @throws IOException declared by this method; propagated from {@code parse()}
 */
@Override
public final boolean incrementToken() throws IOException {
    if (result == null) {
        parse();
    }

    final Object head = result.pollFirst();
    if (head == null) {
        result = null;
        return false;
    }

    if (!(head instanceof Term)) {
        // Not a Term (for example a synonym string queued by an earlier call): emit its text.
        positionAttr.setPositionIncrement(position);
        termAtt.setEmpty().append(head.toString());
        return true;
    }

    clearAttributes();
    Term current = (Term) head;
    // Skip stop words; each skipped term still advances the position.
    while (filterTerm(current)) {
        current = (Term) result.pollFirst();
        if (current == null) {
            result = null;
            return false;
        }
        position++;
    }
    position++;

    // Fetch synonyms: the first one is emitted now, the others are queued at the front.
    final List<String> synonyms = current.getSynonyms();
    final String rName;
    if (synonyms == null) {
        rName = current.getName();
    } else {
        for (int idx = 1; idx < synonyms.size(); idx++) {
            result.addFirst(synonyms.get(idx));
        }
        rName = synonyms.get(0);
    }

    final int startOffset = current.getOffe();
    offsetAtt.setOffset(startOffset, current.getOffe() + current.getName().length());
    typeAtt.setType(current.getNatureStr());
    positionAttr.setPositionIncrement(position);
    termAtt.setEmpty().append(rName);
    return true;
}
Also used : Term(org.ansj.domain.Term)

Example 30 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class IndexAndTest method test.

/**
 * Indexes one sentence into an in-memory directory with the index-mode Ansj analyzer, then
 * searches the index for every term of that sentence as a quoted phrase.
 *
 * @throws Exception if loading the dictionary, indexing or searching fails
 */
@Test
public void test() throws Exception {
    DicLibrary.put(DicLibrary.DEFAULT, "../../library/default.dic");

    PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
    IndexWriterConfig writerConfig = new IndexWriterConfig(indexAnalyzer);

    String text = "旅游和服务是最好的";
    System.out.println(IndexAnalysis.parse(text));

    // Build the in-memory index.
    Directory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, writerConfig);
    addContent(writer, text);
    writer.commit();
    writer.close();
    System.out.println("索引建立完毕");

    Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);
    System.out.println("index ok to search!");

    // Query the index once per term, wrapping the term in quotes.
    for (Term term : IndexAnalysis.parse(text)) {
        System.out.println(term.getName());
        String phraseQuery = "\"" + term.getName() + "\"";
        search(queryAnalyzer, directory, phraseQuery);
    }
}
Also used : AnsjAnalyzer(org.ansj.lucene5.AnsjAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) Term(org.ansj.domain.Term) AnsjAnalyzer(org.ansj.lucene5.AnsjAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Test(org.junit.Test)

Aggregations

Term (org.ansj.domain.Term): 55, ArrayList (java.util.ArrayList): 10, Result (org.ansj.domain.Result): 8, Test (org.junit.Test): 8, TermNatures (org.ansj.domain.TermNatures): 5, AsianPersonRecognition (org.ansj.recognition.arrimpl.AsianPersonRecognition): 4, ForeignPersonRecognition (org.ansj.recognition.arrimpl.ForeignPersonRecognition): 4, NumRecognition (org.ansj.recognition.arrimpl.NumRecognition): 4, Graph (org.ansj.util.Graph): 4, Forest (org.nlpcn.commons.lang.tire.domain.Forest): 4, LinkedList (java.util.LinkedList): 3, NewWord (org.ansj.domain.NewWord): 3, UserDefineRecognition (org.ansj.recognition.arrimpl.UserDefineRecognition): 3, NatureRecognition (org.ansj.recognition.impl.NatureRecognition): 3, GetWord (org.nlpcn.commons.lang.tire.GetWord): 3, BufferedReader (java.io.BufferedReader): 2, HashMap (java.util.HashMap): 2, TermNature (org.ansj.domain.TermNature): 2, ToAnalysis (org.ansj.splitWord.analysis.ToAnalysis): 2, Analyzer (org.apache.lucene.analysis.Analyzer): 2