Use of io.openk9.index.writer.entity.model.DocumentEntityResponse in project openk9 by smclab.
Class GetOrAddEntities, method cleanCandidates.
/**
 * Narrows the candidates found for an entity request down to the first candidate
 * when that candidate's score exceeds {@code _scoreThreshold}.
 *
 * <p>The score is computed in one of two ways:
 * <ul>
 *   <li>more than one candidate: {@code _softmax} of the first candidate's score
 *       against the scores of all candidates;</li>
 *   <li>exactly one candidate: {@code _levenshteinDistance} between the cleaned name
 *       of the candidate and the cleaned name of the request, each cleaned with the
 *       cleaner registered for its own type.</li>
 * </ul>
 *
 * @param entityRequest the entity being looked up
 * @param candidates    candidates for the request; only the first element can be
 *                      selected (presumably the list is ordered best-first — confirm
 *                      against the caller)
 * @return a singleton list with the first candidate when its score is strictly above
 *         the threshold; otherwise {@code candidates} itself, including when it is empty
 */
private List<DocumentEntityResponse> cleanCandidates(EntityRequest entityRequest, List<DocumentEntityResponse> candidates) {
    if (_log.isDebugEnabled()) {
        _log.debug("entity " + entityRequest.getName() + " candidates: " + candidates);
    }

    // Nothing to score: hand the (empty) list straight back.
    if (candidates.isEmpty()) {
        if (_log.isDebugEnabled()) {
            _log.debug("candidates empty");
        }
        return candidates;
    }

    DocumentEntityResponse topCandidate = candidates.get(0);
    double score;

    if (candidates.size() == 1) {
        if (_log.isDebugEnabled()) {
            _log.debug("levenshtein");
        }
        // A lone candidate has nothing to be ranked against, so compare names instead.
        score = _levenshteinDistance(
            _entityNameCleanerProvider
                .getEntityNameCleaner(topCandidate.getType())
                .cleanEntityName(topCandidate.getName()),
            _entityNameCleanerProvider
                .getEntityNameCleaner(entityRequest.getType())
                .cleanEntityName(entityRequest.getName()));
    } else {
        if (_log.isDebugEnabled()) {
            _log.debug("softmax");
        }
        double[] allScores = candidates.stream()
            .mapToDouble(DocumentEntityResponse::getScore)
            .toArray();
        score = _softmax(topCandidate.getScore(), allScores);
    }

    if (_log.isDebugEnabled()) {
        _log.debug("current score: " + score + " score threshold: " + _scoreThreshold + " for entity " + entityRequest.getName());
    }

    if (score > _scoreThreshold) {
        _log.debug("filtered with treshold");
        return Collections.singletonList(topCandidate);
    }

    return candidates;
}
Use of io.openk9.index.writer.entity.model.DocumentEntityResponse in project openk9 by smclab.
Class CreateEntitiesRunnable, method cleanCandidates.
/**
 * Reduces {@code candidates} to its first element when that element's score is
 * strictly greater than {@code scoreThreshold}.
 *
 * <p>With more than one candidate the score is {@code _softmax} of the first
 * candidate's score against all candidate scores. With exactly one candidate it is
 * {@code _levenshteinDistance} between the cleaned candidate name and the cleaned
 * requested name, each cleaned by the cleaner looked up for its own type.
 *
 * @param entityRequest             the entity being looked up
 * @param candidates                index hits for the entity; only the first element
 *                                  can be selected
 * @param entityNameCleanerProvider source of per-type name cleaners
 * @param scoreThreshold            score the first candidate has to exceed
 * @return a singleton list with the first candidate when it passes the threshold;
 *         otherwise {@code candidates} unchanged (also when it is empty)
 */
private List<EntityIndex> cleanCandidates(Entity entityRequest, List<EntityIndex> candidates, EntityNameCleanerProvider entityNameCleanerProvider, float scoreThreshold) {
    if (candidates.isEmpty()) {
        return candidates;
    }

    EntityIndex topCandidate = candidates.get(0);
    double score;

    if (candidates.size() == 1) {
        // A single hit cannot be ranked against others; fall back to name comparison.
        score = _levenshteinDistance(
            entityNameCleanerProvider
                .get(topCandidate.getType())
                .cleanEntityName(topCandidate.getName()),
            entityNameCleanerProvider
                .get(entityRequest.getType())
                .cleanEntityName(entityRequest.getName()));
    } else {
        double[] allScores = candidates.stream()
            .mapToDouble(EntityIndex::getScore)
            .toArray();
        score = _softmax(topCandidate.getScore(), allScores);
    }

    if (score > scoreThreshold) {
        return Collections.singletonList(topCandidate);
    }

    return candidates;
}
Use of io.openk9.index.writer.entity.model.DocumentEntityResponse in project openk9 by smclab.
Class GetOrAddEntities, method _disambiguate.
/**
 * Resolves the graph entity for {@code currentEntityRequest}: reuses a candidate when
 * the graph lookup returns one, otherwise creates a new entity.
 *
 * <p>The lookup depends on whether the requested type is listed in
 * {@code _uniqueEntities}:
 * <ul>
 *   <li>type not listed and {@code candidates} non-empty: for every element of
 *       {@code entityRequestList} a Cypher statement is built that matches the node by
 *       id, calls {@code APOC_PATH_EXPAND} with {@code _labelFilter}, {@code _minHops}
 *       and {@code _maxHops}, and returns the last node of each path together with its
 *       hop count. The statements are wrapped in a sub-query (combined with UNION ALL
 *       when there is more than one) and ordered by hop count;</li>
 *   <li>type listed and exactly one candidate: that candidate is fetched by id.</li>
 * </ul>
 *
 * <p>The first entity emitted by the lookup whose id equals the id of any candidate is
 * returned. If there is none, a new entity is added through
 * {@code _entityGraphRepository.addEntity} and, unless its type is listed in
 * {@code _notIndexEntities}, also passed to {@code _insertEntity}.
 *
 * @param candidates           candidates for the current request
 * @param entityRequestList    entities used as starting nodes of the path expansion
 * @param tenantId             tenant used when a new entity has to be created
 * @param currentEntityRequest the entity being resolved
 * @return a {@code Mono} emitting the reused or the newly created entity
 */
private Mono<Entity> _disambiguate(List<DocumentEntityResponse> candidates, List<DocumentEntityResponse> entityRequestList, long tenantId, EntityRequest currentEntityRequest) {
    Flux<Entity> lookup = Flux.empty();
    String requestedType = currentEntityRequest.getType();

    if (_log.isDebugEnabled()) {
        _log.debug(Arrays.toString(_uniqueEntities));
    }

    if (!candidates.isEmpty() && !_containsValue(_uniqueEntities, requestedType)) {
        if (_log.isDebugEnabled()) {
            _log.debug("disambiguating with search entity with type " + requestedType);
        }

        // One path-expansion statement per related entity.
        Statement[] expansions = new Statement[entityRequestList.size()];
        int slot = 0;
        for (DocumentEntityResponse related : entityRequestList) {
            Node startNode = Cypher.node(related.getType()).named(ENTITY);
            AliasedExpression startAlias = startNode.as(ENTITY);
            SymbolicName pathName = Cypher.name(PATH);

            expansions[slot++] = Cypher.match(startNode)
                .where(Functions.id(startNode).eq(literalOf(related.getId())))
                .call(APOC_PATH_EXPAND)
                .withArgs(
                    startAlias.getDelegate(),
                    literalOf(null),
                    literalOf(_labelFilter),
                    literalOf(_minHops),
                    literalOf(_maxHops))
                .yield(pathName)
                .returning(
                    Functions.last(Functions.nodes(pathName)).as(NODE),
                    Functions.size(Functions.nodes(pathName)).subtract(literalOf(1)).as(HOPS))
                .build();
        }

        // With no related entities there is nothing to query; the lookup stays empty.
        if (expansions.length > 0) {
            Statement subQuery = expansions.length == 1
                ? expansions[0]
                : Cypher.unionAll(expansions);
            Statement orderedByHops = Cypher.call(subQuery)
                .returning(NODE, HOPS)
                .orderBy(Cypher.name(HOPS))
                .build();
            lookup = _entityGraphRepository.getEntities(orderedByHops);
        }
    }

    if (candidates.size() == 1 && _containsValue(_uniqueEntities, requestedType)) {
        if (_log.isDebugEnabled()) {
            _log.debug("disambiguating entity with type " + requestedType);
        }
        DocumentEntityResponse onlyCandidate = candidates.get(0);
        lookup = _entityGraphRepository.getEntity(onlyCandidate.getId()).flux();
    }

    // NOTE(review): ids are compared with ==; if both getId() methods return a boxed
    // type this is a reference comparison — confirm against the model classes.
    Mono<Entity> reused = lookup
        .filter(found -> candidates.stream().anyMatch(candidate -> candidate.getId() == found.getId()))
        .next();

    // The fallback is deferred so the entity is only created when nothing was reused.
    return reused.switchIfEmpty(Mono.defer(() ->
        _entityGraphRepository
            .addEntity(tenantId, currentEntityRequest.getName(), currentEntityRequest.getType())
            .flatMap(created -> {
                if (_containsValue(_notIndexEntities, created.getType())) {
                    return Mono.just(created);
                }
                return _insertEntity(
                    DocumentEntityRequest.builder()
                        .tenantId(created.getTenantId())
                        .name(created.getName())
                        .type(created.getType())
                        .id(created.getId())
                        .build())
                    .thenReturn(created);
            })));
}
Aggregations