Use of org.ansj.splitWord.analysis.NlpAnalysis in project ansj_seg by NLPchina.
The class AppTest, method main.
public static void main(String[] args) throws IOException {
    // Build a stop-word filter from a newline-separated dictionary string.
    String stopDicStr = "6\n7\n龙";
    StopRecognition testFilter = new StopRecognition();
    BufferedReader br = new BufferedReader(new StringReader(stopDicStr));
    String temp = null;
    while ((temp = br.readLine()) != null) {
        testFilter.insertStopWords(temp);
    }
    List<StopRecognition> filters = new ArrayList<StopRecognition>();
    filters.add(testFilter);
    for (int i = 0; i < 1; i++) {
        // Note: the same StringReader is shared by all five analyzers below,
        // so only the first parse call sees the full input.
        StringReader reader = new StringReader("龙虎胶囊 6 * 7cm");
        parse(new IndexAnalysis(reader), filters);
        parse(new ToAnalysis(reader), filters);
        parse(new DicAnalysis(reader), filters);
        parse(new NlpAnalysis(reader), filters);
        parse(new BaseAnalysis(reader), filters);
    }
}
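The loop above calls a parse helper that this excerpt does not show. A minimal stand-in, assuming the helper only needs to drain the analysis and print each term (the real project method may also apply the StopRecognition filters, whose API is not visible here):

    // Hypothetical stand-in for the parse helper used above; not the project's actual code.
    // Assumes the imports already present in AppTest (org.ansj.domain.Term,
    // org.ansj.splitWord.Analysis, org.ansj.recognition.impl.StopRecognition).
    private static void parse(Analysis analysis, List<StopRecognition> filters) throws IOException {
        Term term = null;
        while ((term = analysis.next()) != null) {
            System.out.println(term.getName());
        }
        // The filters argument is accepted to match the call sites above; how the real
        // helper applies it is not shown in this excerpt.
    }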
Use of org.ansj.splitWord.analysis.NlpAnalysis in project ansj_seg by NLPchina.
The class AnsjAnalyzer, method getTokenizer.
/**
 * Get a tokenizer.
 *
 * @param reader input to tokenize (may be null)
 * @param args   configuration map (type, dictionaries, recognition switches)
 * @return a configured AnsjTokenizer
 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch (AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp));
            }
            break;
        default:
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // user-defined dictionaries
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                continue;
            }
            forests[i] = DicLibrary.get(split[i]);
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // user-defined stop-word dictionaries
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null)
                filters.add(stop);
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // synonym dictionaries
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null)
                synonyms.add(new SynonymsRecgnition(sf));
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // ambiguity dictionary
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // enable or disable person-name recognition
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // enable or disable number recognition
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // enable or disable quantifier recognition
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // whether to keep the original (untransformed) characters
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
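For orientation, a brief usage sketch of the factory above. The map keys come straight from the method ("type", "isRealName", and the library DEFAULT constants); the commented-out dictionary names are placeholders that would first have to be registered with DicLibrary and StopLibrary.

    // Minimal, hedged usage sketch for getTokenizer; the keys mirror the ones read above.
    Map<String, String> args = new HashMap<String, String>();
    args.put("type", "nlp_ansj");                 // select the NlpAnalysis-backed tokenizer
    args.put("isRealName", "true");               // keep the original characters in the output
    // args.put(DicLibrary.DEFAULT, "myDic");     // placeholder: comma-separated user dictionary keys
    // args.put(StopLibrary.DEFAULT, "myStop");   // placeholder: comma-separated stop dictionary keys
    Tokenizer tokenizer = AnsjAnalyzer.getTokenizer(new StringReader("龙虎胶囊 6 * 7cm"), args);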
Use of org.ansj.splitWord.analysis.NlpAnalysis in project ansj_seg by NLPchina.
The class KeyWordComputerTest, method test.
@Test
public void test() {
    // Extract the top 3 keywords from the article using the NLP tokenizer.
    String content = "北京英富森软件股份有限公司是在北京市海淀区注册的高新技术企业、双软企业。“信息中国”(information china简称“infcn” )是“英富森”的核心目标与战略。英富森公司的成立依托于凌云实验室的部分成果和理念,主要以信息管理与信息服务、知识管理与知识服务为基本方向,侧重于信息的整合、组织、发现和利用。通过先进的信息技术和服务理念,帮助行业客户建立企业级信息服务与知识服务平台,实现客户的企业级信息与知识的应用与发现。 公司来源于信息行业,依托于高校和科研院所,服务于行业客户。英富森凝聚了一支专业、高效、快乐、融洽的优秀团队,锻造出了一支服务型、管理型、创新型与开拓型的团队。";
    KeyWordComputer kwc = new KeyWordComputer(3, new NlpAnalysis());
    System.out.println(kwc.computeArticleTfidf(content));
}
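The test prints the whole keyword list through its toString(). To work with the individual entries, the same call can be iterated; a small sketch that could replace the println above, assuming the result is a collection of Keyword objects exposing getName() and getScore() (as in recent ansj versions):

    // Alternative to the println above: iterate the extracted keywords one by one.
    // Assumption: computeArticleTfidf returns Keyword objects with getName()/getScore().
    for (Keyword keyword : kwc.computeArticleTfidf(content)) {
        System.out.println(keyword.getName() + "\t" + keyword.getScore());
    }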
Use of org.ansj.splitWord.analysis.NlpAnalysis in project ansj_seg by NLPchina.
The class NewWordFindDemo, method main.
public static void main(String[] args) throws IOException {
    // Read the GBK-encoded corpus and let LearnTool collect new-word candidates
    // while NlpAnalysis tokenizes it.
    BufferedReader reader = IOUtil.getReader("/Users/ansj/Downloads/三国演义.txt", "GBK");
    LearnTool learn = new LearnTool();
    NlpAnalysis nlpAnalysis = new NlpAnalysis(reader).setLearnTool(learn);
    while (nlpAnalysis.next() != null) {
        // drain the analysis; LearnTool observes every term as a side effect
    }
    // Dump all discovered candidates with their scores to a UTF-8 result file.
    List<Entry<String, Double>> topTree = learn.getTopTree(0);
    StringBuilder sb = new StringBuilder();
    for (Entry<String, Double> entry : topTree) {
        sb.append(entry.getKey() + "\t" + entry.getValue() + "\n");
    }
    IOUtil.Writer("/Users/ansj/Desktop/result.txt", IOUtil.UTF8, sb.toString());
}
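The same new-word discovery pattern works on an in-memory string; the sketch below reuses only the APIs already shown in the demo (NlpAnalysis(Reader), setLearnTool, next(), getTopTree) and, like the demo, should run inside a method that declares throws IOException. The corpus text is a placeholder.

    // New-word discovery over an in-memory corpus instead of a file on disk.
    LearnTool learn = new LearnTool();
    NlpAnalysis analysis = new NlpAnalysis(new StringReader("...corpus text...")).setLearnTool(learn);
    while (analysis.next() != null) {
        // drain the analysis so LearnTool can observe every term
    }
    for (Entry<String, Double> entry : learn.getTopTree(0)) {
        System.out.println(entry.getKey() + "\t" + entry.getValue());
    }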