Usage of org.ansj.domain.Term in the project ansj_seg by NLPchina.
Class IndexAndTest, method test.
/**
 * Indexes a short Chinese sentence into an in-memory Lucene directory using
 * the Ansj index analyzer, then runs one phrase query per term produced by
 * {@code IndexAnalysis.parse} on the same sentence.
 *
 * <p>The directory and the writer are opened in try-with-resources so they
 * are released even when indexing or searching throws.
 *
 * @throws Exception if loading the dictionary, indexing or searching fails
 */
@Test
public void test() throws Exception {
    DicLibrary.put(DicLibrary.DEFAULT, "../../library/default.dic");
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
    IndexWriterConfig ic = new IndexWriterConfig(analyzer);
    String text = "旅游和服务是最好的";
    System.out.println(IndexAnalysis.parse(text));
    // Build the in-memory index; the directory stays open until all searches are done.
    try (Directory directory = new RAMDirectory()) {
        // The writer is scoped tightly so it is closed before any search runs,
        // exactly as in the sequential version, but also on the failure path.
        try (IndexWriter iwriter = new IndexWriter(directory, ic)) {
            addContent(iwriter, text);
            iwriter.commit();
        }
        System.out.println("索引建立完毕");
        Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);
        System.out.println("index ok to search!");
        for (Term t : IndexAnalysis.parse(text)) {
            System.out.println(t.getName());
            // Quote the term so it is searched as an exact phrase.
            search(queryAnalyzer, directory, "\"" + t.getName() + "\"");
        }
    }
}
Usage of org.ansj.domain.Term in the project ansj_seg by NLPchina.
Class KeyWordComputer, method computeArticleTfidf.
/**
 * Segments the text, builds one {@code Keyword} per distinct term name and
 * returns the highest-ranked ones.
 *
 * <p>Terms whose weight (from {@code getWeight}) is zero are skipped. The
 * first occurrence of a name creates the keyword; every later occurrence
 * calls {@code updateWeight(1)} on it. Ranking is done by passing the
 * keywords through a {@code TreeSet}, i.e. by {@code Keyword}'s natural
 * ordering.
 *
 * @param content     body text to analyse
 * @param titleLength title length, forwarded unchanged to {@code getWeight}
 * @return the ranked keywords, truncated to the first {@code nKeyword}
 *         entries when there are more than that
 */
private List<Keyword> computeArticleTfidf(String content, int titleLength) {
    Map<String, Keyword> byName = new HashMap<String, Keyword>();
    for (Term term : analysisType.parseStr(content).getTerms()) {
        double weight = getWeight(term, content.length(), titleLength);
        if (weight == 0) {
            continue;
        }
        String name = term.getName();
        Keyword existing = byName.get(name);
        if (existing != null) {
            // Repeated term: bump the keyword that is already registered.
            existing.updateWeight(1);
            continue;
        }
        byName.put(name, new Keyword(name, term.natrue().allFrequency, weight));
    }
    // Sort via TreeSet, then copy into a list so it can be sliced by index.
    List<Keyword> ranked = new ArrayList<Keyword>(new TreeSet<Keyword>(byName.values()));
    return ranked.size() <= nKeyword ? ranked : ranked.subList(0, nKeyword);
}
Usage of org.ansj.domain.Term in the project ansj_seg by NLPchina.
Class SummaryComputer, method toSummary.
/**
 * Computes a summary from a user query string.
 *
 * <p>The query is segmented with {@code NlpAnalysis}; every term whose
 * nature string is not in {@code FILTER_SET} becomes a {@code Keyword} with
 * weight 1, and the resulting list is handed to the keyword-based
 * {@code toSummary} overload.
 *
 * @param query the user query to segment
 * @return the summary produced by {@code toSummary(List<Keyword>)}
 */
public Summary toSummary(String query) {
    List<Keyword> keywords = new ArrayList<Keyword>();
    for (Term term : NlpAnalysis.parse(query).getTerms()) {
        boolean filteredOut = FILTER_SET.contains(term.natrue().natureStr);
        if (!filteredOut) {
            keywords.add(new Keyword(term.getName(), term.termNatures().allFreq, 1));
        }
    }
    return toSummary(keywords);
}
Usage of org.ansj.domain.Term in the project ansj_seg by NLPchina.
Class DicLibraryTest, method delete.
/**
 * Deletes a keyword from the default dictionary and asserts that parsing a
 * sentence containing it no longer yields that keyword as a term.
 */
@Test
public void delete() {
    insertTest();
    DicLibrary.delete(DicLibrary.DEFAULT, "增加新词");
    Result parse = DicAnalysis.parse("这是用户自定义词典增加新词的例子");
    System.out.println(parse);
    boolean found = false;
    for (Term term : parse) {
        // Once a match has been seen, later terms are not inspected further.
        if (!found && "增加新词".equals(term.getName())) {
            found = true;
        }
    }
    Assert.assertFalse(found);
}
Usage of org.ansj.domain.Term in the project ansj_seg by NLPchina.
Class FilterRecognitionTest, method test.
/**
 * Applies a {@code StopRecognition} configured with stop natures, a stop
 * word and a stop regex to a parsed sentence, and asserts that none of the
 * configured natures or words survive in the filtered result.
 */
@Test
public void test() {
    String str = "我的小鸡鸡丢了!";
    Result parse = ToAnalysis.parse(str);
    System.out.println(parse);
    StopRecognition filter = new StopRecognition();
    filter.insertStopNatures("uj");
    filter.insertStopNatures("ul");
    filter.insertStopNatures("null");
    filter.insertStopWords("我");
    filter.insertStopRegexes("小.*?");
    Result modifResult = parse.recognition(filter);
    for (Term term : modifResult) {
        // Compare by value: assertNotSame only checks reference identity, so
        // it would pass for an equal string held in a different object and
        // the test could never detect a term that slipped through the filter.
        Assert.assertFalse("uj".equals(term.getNatureStr()));
        Assert.assertFalse("ul".equals(term.getNatureStr()));
        Assert.assertFalse("null".equals(term.getNatureStr()));
        Assert.assertFalse("我".equals(term.getName()));
        Assert.assertFalse("小鸡鸡".equals(term.getName()));
    }
    System.out.println(modifResult);
}
Aggregations