Use of org.nlpcn.commons.lang.tire.domain.SmartForest in project ansj_seg by NLPchina.
The class AnsjAnalyzer, method getTokenizer.
/**
 * Obtain a tokenizer.
 *
 * @param reader the input to tokenize
 * @param args   tokenizer configuration options
 * @return the configured Tokenizer
 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch (AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp));
            }
            break;
        default:
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // user-defined dictionaries
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                continue;
            }
            forests[i] = DicLibrary.get(split[i]);
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // user-defined stop-word dictionaries
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null)
                filters.add(stop);
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // synonym dictionaries
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null)
                synonyms.add(new SynonymsRecgnition(sf));
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // ambiguity dictionary
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // whether to enable person-name recognition
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // whether to enable number recognition
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // whether to enable quantifier recognition
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // whether to keep the original characters
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
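For orientation, here is a minimal calling sketch (not from the project): the map keys mirror the switches read above, the type value must be one of the AnsjAnalyzer.TYPE names, and the imports for Tokenizer and AnsjAnalyzer are assumptions that depend on which Lucene plugin module of ansj_seg is in use.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
// Assumed: Tokenizer (Lucene) and AnsjAnalyzer come from the ansj Lucene plugin module in use.

public class TokenizerUsageSketch {
    public static void main(String[] args) throws Exception {
        Map<String, String> cfg = new HashMap<String, String>();
        cfg.put("type", "dic_ansj");            // any AnsjAnalyzer.TYPE name; null falls back to base_ansj
        cfg.put("isNameRecognition", "true");   // optional boolean switch read by getTokenizer
        Tokenizer tokenizer = AnsjAnalyzer.getTokenizer(new StringReader("这是一个测试"), cfg);
        tokenizer.close();                      // a real caller would reset() and iterate the token stream
    }
}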
Use of org.nlpcn.commons.lang.tire.domain.SmartForest in project ansj_seg by NLPchina.
The class TagContent, method tagContent.
public String tagContent(List<Keyword> keyWords, String content) {
    // build a trie of lowercase keyword names mapped to their scores
    SmartForest<Double> sf = new SmartForest<Double>();
    for (Keyword keyWord : keyWords) {
        sf.add(keyWord.getName().toLowerCase(), keyWord.getScore());
    }
    // scan the lowercased content for keyword matches
    SmartGetWord<Double> sgw = new SmartGetWord<Double>(sf, content.toLowerCase());
    int beginOffe = 0;
    String temp = null;
    StringBuilder sb = new StringBuilder();
    while ((temp = sgw.getFrontWords()) != null) {
        // copy the text before the match, then wrap the match in begin/end tags
        sb.append(content.substring(beginOffe, sgw.offe));
        sb.append(beginTag);
        sb.append(content.substring(sgw.offe, sgw.offe + temp.length()));
        sb.append(endTag);
        beginOffe = sgw.offe + temp.length();
    }
    // append the remainder after the last match
    if (beginOffe <= content.length() - 1) {
        sb.append(content.substring(beginOffe, content.length()));
    }
    return sb.toString();
}
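The SmartForest/SmartGetWord pair does the actual matching in the loop above. Below is a standalone sketch of the same pattern, under the assumption that SmartGetWord lives in org.nlpcn.commons.lang.tire (adjust the import to the nlp-lang version in use); the keyword, score, and input text are made up for illustration.

import org.nlpcn.commons.lang.tire.SmartGetWord;   // assumed package; see note above
import org.nlpcn.commons.lang.tire.domain.SmartForest;

public class KeywordMatchSketch {
    public static void main(String[] args) {
        SmartForest<Double> forest = new SmartForest<Double>();
        forest.add("ansj", 1.0);                               // keyword mapped to its score
        SmartGetWord<Double> getWord = new SmartGetWord<Double>(forest, "ansj seg highlight demo");
        String word;
        while ((word = getWord.getFrontWords()) != null) {
            // getWord.offe is the start offset of the matched keyword in the input
            System.out.println(word + " @ " + getWord.offe);
        }
    }
}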
Use of org.nlpcn.commons.lang.tire.domain.SmartForest in project ansj_seg by NLPchina.
The class SummaryComputer, method explan.
/**
 * Compute the summary.
 *
 * @param keywords the weighted keywords
 * @param content  the original text
 * @return the computed Summary
 */
private Summary explan(List<Keyword> keywords, String content) {
    SmartForest<Double> sf = new SmartForest<Double>();
    for (Keyword keyword : keywords) {
        sf.add(keyword.getName(), keyword.getScore());
    }
    // split into sentences first
    List<Sentence> sentences = toSentenceList(content.toCharArray());
    for (Sentence sentence : sentences) {
        computeScore(sentence, sf);
    }
    double maxScore = 0;
    int maxIndex = 0;
    MapCount<String> mc = new MapCount<>();
    for (int i = 0; i < sentences.size(); i++) {
        double tempScore = sentences.get(i).score;
        int tempLength = sentences.get(i).value.length();
        mc.addAll(sentences.get(i).mc.get());
        if (tempLength >= len) {
            tempScore = tempScore * mc.get().size();
            if (maxScore < tempScore) {
                maxScore = tempScore;
                maxIndex = i;
                continue;
            }
            mc.get().clear();
        }
        for (int j = i + 1; j < sentences.size(); j++) {
            tempScore += sentences.get(j).score;
            tempLength += sentences.get(j).value.length();
            mc.addAll(sentences.get(j).mc.get());
            if (tempLength >= len) {
                tempScore = tempScore * mc.get().size();
                if (maxScore < tempScore) {
                    maxScore = tempScore;
                    maxIndex = i;
                }
                mc.get().clear();
                break;
            }
        }
        if (tempLength < len) {
            tempScore = tempScore * mc.get().size();
            if (maxScore < tempScore) {
                maxScore = tempScore;
                maxIndex = i;
                break;
            }
            mc.get().clear();
        }
    }
    StringBuilder sb = new StringBuilder();
    for (int i = maxIndex; i < sentences.size(); i++) {
        sb.append(sentences.get(i).value);
        if (sb.length() > len) {
            break;
        }
    }
    String summaryStr = sb.toString();
    if (isSplitSummary && sb.length() > len) {
        // trim the summary to the target length, counting half-width characters as 0.5
        double value = len;
        StringBuilder newSummary = new StringBuilder();
        char c = 0;
        for (int i = 0; i < sb.length(); i++) {
            c = sb.charAt(i);
            if (c < 256) {
                value -= 0.5;
            } else {
                value -= 1;
            }
            if (value < 0) {
                break;
            }
            newSummary.append(c);
        }
        summaryStr = newSummary.toString();
    }
    return new Summary(keywords, summaryStr);
}
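The trailing loop trims the summary by display width rather than character count: half-width (ASCII) characters consume 0.5 of the budget and everything else consumes 1. Here is a small standalone helper with the same counting rule; the class and method names and their standalone form are ours, not part of SummaryComputer.

// Hypothetical helper mirroring the width-weighted truncation in explan.
public final class WidthTruncate {
    static String truncateByWidth(String text, double budget) {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            budget -= (c < 256) ? 0.5 : 1;   // ASCII counts half, wide characters count one
            if (budget < 0) {
                break;
            }
            out.append(c);
        }
        return out.toString();
    }

    public static void main(String[] args) {
        // prints "abc中": three half-width chars (1.5) plus one wide char (1) fit within a budget of 3
        System.out.println(truncateByWidth("abc中文def", 3));
    }
}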