Use of org.nlpcn.commons.lang.util.MapCount in project ansj_seg by NLPchina.
The class Model, method writeModel:
/**
 * Serialize the model to disk.
 *
 * @param path the file path to write the model to
 */
public void writeModel(String path) {
    try (FileOutputStream fso = new FileOutputStream(path);
            ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(fso))) {
        oos.writeUTF(CRFModel.version);
        oos.writeObject(status);
        oos.writeObject(config.getTemplate());
        Map<String, float[]> map = featureTree.toMap();
        MapCount<Integer> mc = new MapCount<Integer>();
        for (float[] v : map.values()) {
            mc.add(v.length);
        }
        for (Entry<Integer, Double> entry : mc.get().entrySet()) {
            int win = entry.getKey();
            // width of the feature vectors in this group
            oos.writeInt(win);
            // number of features with this width
            oos.writeInt(entry.getValue().intValue());
            for (Entry<String, float[]> e : map.entrySet()) {
                if (e.getValue().length == win) {
                    oos.writeUTF(e.getKey());
                    float[] value = e.getValue();
                    for (int i = 0; i < win; i++) {
                        oos.writeFloat(value[i]);
                    }
                }
            }
        }
        // terminating width/count pair
        oos.writeInt(0);
        oos.writeInt(0);
        oos.flush();
    } catch (FileNotFoundException e) {
        logger.warn("file not found", e);
    } catch (IOException e) {
        logger.warn("IO exception", e);
    }
}
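For reference, here is a minimal standalone sketch (not part of ansj_seg; the class name, feature keys and widths are invented) of the role MapCount plays above: it builds a histogram of feature-vector widths so the vectors can be written out grouped by width. It only relies on the MapCount.add and MapCount.get calls already used in writeModel.

import java.util.HashMap;
import java.util.Map;

import org.nlpcn.commons.lang.util.MapCount;

public class WidthHistogramSketch {

    public static void main(String[] args) {
        // hypothetical feature map, standing in for featureTree.toMap()
        Map<String, float[]> map = new HashMap<>();
        map.put("U1:word", new float[4]);
        map.put("U2:word", new float[4]);
        map.put("B0:word", new float[8]);

        MapCount<Integer> mc = new MapCount<Integer>();
        for (float[] v : map.values()) {
            // count one occurrence of this vector width
            mc.add(v.length);
        }
        // mc.get() holds the accumulated counts, here {4=2.0, 8=1.0}
        System.out.println(mc.get());
    }
}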
Use of org.nlpcn.commons.lang.util.MapCount in project ansj_seg by NLPchina.
The class NlpAnalysis, method getResult:
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            if (learn == null) {
                learn = new LearnTool();
            }
            graph.walkPath();
            learn.learn(graph, splitWord, forests);
            // person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            if (splitWord != null) {
                MapCount<String> mc = new MapCount<String>();
                // segment with the CRF model
                List<String> words = splitWord.cut(graph.chars);
                Term tempTerm = null;
                int tempOff = 0;
                if (words.size() > 0) {
                    String word = words.get(0);
                    if (!isRuleWord(word)) {
                        mc.add("始##始" + TAB + word, CRF_WEIGHT);
                    }
                }
                for (String word : words) {
                    // try to get the part of speech from the dictionary
                    TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word);
                    Term term = null;
                    if (termNatures != TermNatures.NULL) {
                        term = new Term(word, tempOff, termNatures);
                    } else {
                        term = new Term(word, tempOff, TermNatures.NW);
                        term.setNewWord(true);
                    }
                    // advance the offset
                    tempOff += word.length();
                    if (isRuleWord(word)) {
                        // the word failed the rule check, so drop it
                        tempTerm = null;
                        continue;
                    }
                    if (term.isNewWord()) {
                        // try to guess the part of speech
                        termNatures = NatureRecognition.guessNature(word);
                        term.updateTermNaturesAndNature(termNatures);
                    }
                    TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
                    // be conservative with words that are not in the dictionary
                    if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
                        mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
                    }
                    tempTerm = term;
                    if (term.isNewWord()) {
                        learn.addTerm(new NewWord(word, Nature.NW));
                    }
                }
                if (tempTerm != null && !tempTerm.isNewWord()) {
                    mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
                }
                graph.walkPath(mc.get());
            } else {
                LOG.warn("did not find any crf model, make sure your config is right");
            }
            // number recognition
            if (graph.hasNum && isNumRecognition) {
                new NumRecognition().recognition(graph.terms);
            }
            // part-of-speech tagging
            List<Term> result = getResult();
            // user-defined dictionary recognition
            new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
            // new-word discovery
            new NewWordRecognition(learn).recognition(graph.terms);
            graph.walkPathByScore();
            // rebuild the optimal path after the adjustments above
            result = getResult();
            // activate the learned dictionary
            for (Term term : result) {
                learn.active(term.getName());
            }
            setRealName(graph, result);
            return result;
        }

        private List<Term> getResult() {
            List<Term> result = new ArrayList<Term>();
            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                if (graph.terms[i] == null) {
                    continue;
                }
                result.add(graph.terms[i]);
            }
            return result;
        }
    };
    return merger.merger();
}
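The MapCount usage in this method amounts to building weighted bigram counts over the CRF segmentation, keyed as "left + TAB + right" with the sentinel words 始##始 and 末##末 at the sentence boundaries; graph.walkPath(mc.get()) then biases the path search with those counts. Below is a minimal standalone sketch of just that counting step; the class name, the example words, and the TAB and CRF_WEIGHT values are stand-ins, not the constants NlpAnalysis actually uses.

import java.util.Arrays;
import java.util.List;

import org.nlpcn.commons.lang.util.MapCount;

public class BigramCountSketch {

    // stand-ins for the TAB and CRF_WEIGHT constants used by NlpAnalysis
    private static final String TAB = "\t";
    private static final int CRF_WEIGHT = 1; // illustrative weight only

    public static void main(String[] args) {
        // stand-in for the CRF segmentation splitWord.cut(graph.chars)
        List<String> words = Arrays.asList("中国", "人民", "银行");

        MapCount<String> mc = new MapCount<String>();
        String prev = "始##始"; // sentence-start sentinel
        for (String word : words) {
            mc.add(prev + TAB + word, CRF_WEIGHT);
            prev = word;
        }
        mc.add(prev + TAB + "末##末", CRF_WEIGHT); // sentence-end sentinel

        // the resulting weighted bigram counts are what graph.walkPath(mc.get()) consumes above
        System.out.println(mc.get());
    }
}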
Use of org.nlpcn.commons.lang.util.MapCount in project ansj_seg by NLPchina.
The class SummaryComputer, method explan:
/**
 * Compute the summary.
 *
 * @param keywords scored keywords
 * @param content  original text
 * @return the summary
 */
private Summary explan(List<Keyword> keywords, String content) {
    SmartForest<Double> sf = new SmartForest<Double>();
    for (Keyword keyword : keywords) {
        sf.add(keyword.getName(), keyword.getScore());
    }
    // split the content into sentences first
    List<Sentence> sentences = toSentenceList(content.toCharArray());
    for (Sentence sentence : sentences) {
        computeScore(sentence, sf);
    }
    double maxScore = 0;
    int maxIndex = 0;
    MapCount<String> mc = new MapCount<>();
    for (int i = 0; i < sentences.size(); i++) {
        double tempScore = sentences.get(i).score;
        int tempLength = sentences.get(i).value.length();
        mc.addAll(sentences.get(i).mc.get());
        if (tempLength >= len) {
            tempScore = tempScore * mc.get().size();
            if (maxScore < tempScore) {
                maxScore = tempScore;
                maxIndex = i;
                continue;
            }
            mc.get().clear();
        }
        for (int j = i + 1; j < sentences.size(); j++) {
            tempScore += sentences.get(j).score;
            tempLength += sentences.get(j).value.length();
            mc.addAll(sentences.get(j).mc.get());
            if (tempLength >= len) {
                tempScore = tempScore * mc.get().size();
                if (maxScore < tempScore) {
                    maxScore = tempScore;
                    maxIndex = i;
                }
                mc.get().clear();
                break;
            }
        }
        if (tempLength < len) {
            tempScore = tempScore * mc.get().size();
            if (maxScore < tempScore) {
                maxScore = tempScore;
                maxIndex = i;
                break;
            }
            mc.get().clear();
        }
    }
    StringBuilder sb = new StringBuilder();
    for (int i = maxIndex; i < sentences.size(); i++) {
        sb.append(sentences.get(i).value);
        if (sb.length() > len) {
            break;
        }
    }
    String summaryStr = sb.toString();
    if (isSplitSummary && sb.length() > len) {
        double value = len;
        StringBuilder newSummary = new StringBuilder();
        char c = 0;
        for (int i = 0; i < sb.length(); i++) {
            c = sb.charAt(i);
            if (c < 256) {
                // ASCII/Latin characters count as half width
                value -= 0.5;
            } else {
                value -= 1;
            }
            if (value < 0) {
                break;
            }
            newSummary.append(c);
        }
        summaryStr = newSummary.toString();
    }
    return new Summary(keywords, summaryStr);
}
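Here MapCount serves a third purpose: mc.addAll(...) merges the per-sentence keyword-hit maps of a candidate window, and mc.get().size() is the number of distinct keywords the window covers, which multiplies the window's score. A small standalone sketch of that merging step follows; the class name and the example keywords are invented, and only the MapCount calls already used by explan are relied on.

import org.nlpcn.commons.lang.util.MapCount;

public class KeywordCoverageSketch {

    public static void main(String[] args) {
        // stand-ins for the per-sentence keyword-hit counters (sentence.mc above)
        MapCount<String> sentence1 = new MapCount<String>();
        sentence1.add("ansj");
        sentence1.add("ansj");
        MapCount<String> sentence2 = new MapCount<String>();
        sentence2.add("ansj");
        sentence2.add("nlp");

        // merge the window's sentences, as explan does with mc.addAll(sentences.get(j).mc.get())
        MapCount<String> mc = new MapCount<String>();
        mc.addAll(sentence1.get());
        mc.addAll(sentence2.get());

        // two distinct keywords are covered, so the window score would be multiplied by 2
        System.out.println(mc.get().size());

        // reset before scoring the next candidate window
        mc.get().clear();
    }
}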