use of de.ids_mannheim.korap.util.QueryException in project Krill by KorAP.
the class KrillIndex method getMatchInfo.
/**
 * Get a match.
 *
 * Rebuilds a {@link Match} from its identifier string, looks up the
 * referenced document in the index and, if requested, decorates the
 * match with the annotations found in the term vector of the given
 * field.
 *
 * The annotation information is collected per position in the match:
 * every term of interest lying inside the match span is turned into a
 * {@code TermInfo} and added to the match as an annotation or as a
 * relation.
 *
 * Problems are reported on the returned match rather than thrown:
 * error 730 (invalid match identifier), error 970 (invalid foundry),
 * error 600 (index not readable) and warning 651 (context could not
 * be extended to the sentence).
 *
 * @param idString
 *            The match identifier, passed to the {@link Match}
 *            constructor.
 * @param field
 *            The field whose term vector and offsets are consulted.
 * @param info
 *            Whether annotations should be collected for the match.
 * @param foundry
 *            The foundries the annotations are limited to; may be
 *            <tt>null</tt> or empty to not restrict by foundry.
 * @param layer
 *            The layers the annotations are limited to; only
 *            respected if at least one foundry is given.
 * @param includeSpans
 *            Whether span annotations are of interest.
 * @param includeHighlights
 *            Passed to the {@link Match} constructor.
 * @param extendToSentence
 *            Whether the match should be extended to the surrounding
 *            <tt>base/s:s</tt> element.
 * @return The match, possibly carrying errors or warnings.
 * @throws QueryException
 *             If a requested layer contains characters that are not
 *             allowed.
 */
public Match getMatchInfo(String idString, String field, boolean info, List<String> foundry, List<String> layer, boolean includeSpans, boolean includeHighlights, boolean extendToSentence) throws QueryException {
    if (DEBUG) {
        log.trace("Get info on {}", idString);
    }

    Match match = new Match(idString, includeHighlights);

    if (this.getVersion() != null) {
        match.setVersion(this.getVersion());
    }
    if (this.getName() != null) {
        match.setName(this.getName());
    }

    // No start position available - there is nothing to look up
    // NOTE(review): presumably the identifier could not be parsed
    if (match.getStartPos() == -1) {
        return match;
    }

    // Create a filter based on the corpusID and the docID
    BooleanQuery bool = new BooleanQuery();
    if (match.getTextSigle() != null) {
        bool.add(new TermQuery(new Term("textSigle", match.getTextSigle())), BooleanClause.Occur.MUST);
    }
    // <legacy>
    else if (match.getDocID() != null) {
        bool.add(new TermQuery(new Term("ID", match.getDocID())), BooleanClause.Occur.MUST);
        bool.add(new TermQuery(new Term("corpusID", match.getCorpusID())), BooleanClause.Occur.MUST);
    }
    // </legacy>
    // Invalid
    else {
        match.addError(730, "Invalid match identifier", idString);
        return match;
    }

    if (DEBUG) {
        log.trace("The bool query is {}", bool.toString());
    }

    Filter filter = (Filter) new QueryWrapperFilter(bool);
    CompiledAutomaton fst = null;

    if (info) {
        /* Create an automaton for prefixed terms of interest.
         * You can define the necessary foundry, the necessary layer,
         * in case the foundry is given, and if span annotations
         * are of interest.
         */
        StringBuilder regex = new StringBuilder();

        // TODO: Make these static
        Pattern harmlessFoundry = Pattern.compile("^[-a-zA-Z0-9_]+$");
        Pattern harmlessLayer = Pattern.compile("^[-a-zA-Z0-9_:]+$");

        if (includeSpans) {
            regex.append("((\">\"|\"<\"\">\")\":\")?");
        }

        // There is a foundry given
        if (foundry != null && foundry.size() > 0) {

            // Reject bad foundries before they are embedded in the
            // regular expression (checked from last to first)
            for (int i = foundry.size() - 1; i >= 0; i--) {
                if (!harmlessFoundry.matcher(foundry.get(i)).matches()) {
                    match.addError(970, "Invalid foundry requested", foundry.get(i));
                    return match;
                }
            }

            // Build regex for multiple foundries
            regex.append("(");
            for (String f : foundry) {
                regex.append(f).append("|");
            }
            // Replace the trailing alternation bar by the closing bracket
            regex.replace(regex.length() - 1, regex.length(), ")");
            regex.append("\"/\"");

            // There is a layer filter given
            if (layer != null && layer.size() > 0) {

                // Reject bad layers (checked from last to first)
                for (int i = layer.size() - 1; i >= 0; i--) {
                    if (!harmlessLayer.matcher(layer.get(i)).matches()) {
                        throw new QueryException("Invalid layer requested: " + layer.get(i));
                    }
                }

                // Build regex for multiple layers
                regex.append("(");
                for (String l : layer) {
                    regex.append(l).append("|");
                }
                regex.replace(regex.length() - 1, regex.length(), ")");
                regex.append("\":\"");
            }
        }
        else if (includeSpans) {
            // No foundries - but spans
            regex.append("([^-is]|[-is][^:])");
        }
        else {
            // No foundries - no spans
            regex.append("([^-is<>]|[-is>][^:]|<[^:>])");
        }

        regex.append("(.){1,}|_[0-9]+");

        if (DEBUG) {
            log.trace("The final regexString is {}", regex.toString());
        }

        RegExp regexObj = new RegExp(regex.toString(), RegExp.COMPLEMENT);
        fst = new CompiledAutomaton(regexObj.toAutomaton());

        if (DEBUG) {
            log.trace("The final regexObj is {}", regexObj.toString());
        }
    }

    try {
        // Iterate over all atomic indices and find the matching document
        for (LeafReaderContext atomic : this.reader().leaves()) {

            // Retrieve the single document of interest
            DocIdSet filterSet = filter.getDocIdSet(atomic, atomic.reader().getLiveDocs());
            DocIdSetIterator filterIterator = filterSet.iterator();

            if (DEBUG) {
                // Create a bitset for the correct document
                Bits bitset = filterSet.bits();
                log.trace("Checking document in {} with {}", filterSet, bitset);
            }

            // No document found
            if (filterIterator == null) {
                continue;
            }

            // Go to the matching doc - and remember its ID
            int localDocID = filterIterator.nextDoc();

            if (DEBUG) {
                log.trace("localDocID is {}", localDocID);
            }

            if (localDocID == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }

            // We've found the correct document! Hurray!
            if (DEBUG) {
                log.trace("We've found a matching document");
            }

            // Get terms from the document
            // NOTE(review): not null-checked before the intersect
            // call below - confirm the field always has term vectors
            Terms docTerms = atomic.reader().getTermVector(localDocID, field);

            // The following fields should be lifted for the match
            HashSet<String> fields = (HashSet<String>) new Krill().getMeta().getFields().clone();

            // Lift primary field
            fields.add(field);

            // Lift all fields
            if (fields.contains("@all")) {
                fields = null;
            }

            // Load the necessary fields of the document
            Document doc = atomic.reader().document(localDocID, fields);

            // Put some more information to the match
            PositionsToOffset pto = new PositionsToOffset(atomic, field);
            match.setPositionsToOffset(pto);
            match.setLocalDocID(localDocID);
            match.populateDocument(doc, field, fields);

            if (DEBUG) {
                log.trace("The document has the id '{}' or the sigle '{}'", match.getDocID(), match.getTextSigle());
            }

            // Todo:
            SearchContext context = match.getContext();

            // Override the normal match marking
            // to have an inner match
            match.overrideMatchPosition(match.getStartPos(), match.getEndPos() - 1);

            // Search for minimal surrounding sentences
            if (extendToSentence) {
                String element = "base/s:s";
                int[] spanContext = match.expandContextToSpan(element);

                if (DEBUG) {
                    log.trace("Extend to sentence element '{}'", element);
                }

                // Only accept a non-empty span with a valid start
                if (spanContext[0] >= 0 && spanContext[0] < spanContext[1]) {
                    match.setStartPos(spanContext[0]);
                    match.setEndPos(spanContext[1]);
                    match.potentialStartPosChar = spanContext[2];
                    match.potentialEndPosChar = spanContext[3];
                    match.startMore = false;
                    match.endMore = false;
                }
                else {
                    match.addWarning(651, "Unable to extend context");
                }
            }
            else {
                if (DEBUG) {
                    log.trace("Don't expand context");
                }
            }

            context.left.setToken(true).setLength(0);
            context.right.setToken(true).setLength(0);

            // No annotations requested - the match is complete
            if (!info) {
                break;
            }

            // Limit the terms to all the terms of interest
            TermsEnum termsEnum = docTerms.intersect(fst, null);
            DocsAndPositionsEnum docs = null;

            // List of terms to populate
            SpanInfo termList = new SpanInfo(pto, localDocID);

            // Iterate over all terms in the document
            while (termsEnum.next() != null) {

                // Get the positions and payloads of the term in the document
                // The bitvector may look different (don't know why)
                // and so the local ID may differ.
                // That's why the requesting bitset is null.
                docs = termsEnum.docsAndPositions(null, docs, DocsAndPositionsEnum.FLAG_PAYLOADS);

                // Init document iterator
                docs.nextDoc();

                // Should never happen ... but hell!
                if (docs.docID() == DocIdSetIterator.NO_MORE_DOCS) {
                    continue;
                }

                // String representation of the term
                String termString = termsEnum.term().utf8ToString();

                // Iterate over all occurrences
                for (int i = 0; i < docs.freq(); i++) {

                    // Init positions and get the current
                    int pos = docs.nextPosition();

                    // Check, if the position of the term is in the area of interest
                    if (pos >= match.getStartPos() && pos < match.getEndPos()) {

                        if (DEBUG) {
                            log.trace(">> {}: freq:{}, pos:{}, payload:{}", termString, docs.freq(), pos, docs.getPayload());
                        }

                        BytesRef payload = docs.getPayload();

                        // Copy the payload - payloads exceeding the
                        // buffer capacity are not copied
                        bbTerm.clear();
                        if (payload != null && payload.length <= bbTerm.capacity()) {
                            bbTerm.put(payload.bytes, payload.offset, payload.length);
                        }

                        TermInfo ti = new TermInfo(termString, pos, bbTerm).analyze();

                        // Only respect terms ending inside the match
                        if (ti.getEndPos() < match.getEndPos()) {
                            if (DEBUG) {
                                log.trace("Add {}", ti.toString());
                            }
                            termList.add(ti);
                        }
                    }
                }
            }

            // Add annotations based on the retrieved infos
            for (TermInfo t : termList.getTerms()) {

                if (DEBUG) {
                    log.trace("Add term {}/{}:{} with char:{}(pos:{})-char:{}(pos:{})", t.getFoundry(), t.getLayer(), t.getValue(), t.getStartChar(), t.getStartPos(), t.getEndChar(), t.getEndPos());
                }

                // Compare by value - reference comparison of strings
                // only works by accident of literal interning
                String type = t.getType();

                // Ignore empty types for the moment
                if ("term".equals(type) || "span".equals(type)) {
                    match.addAnnotation(t.getStartPos(), t.getEndPos(), t.getAnnotation());
                }
                // Use relSrc for annotation views
                else if ("relSrc".equals(type)) {
                    // This only respects relSrc!
                    // May require more information for bidirectional relations
                    match.addRelation(t.getStartPos(), t.getEndPos(), t.getTargetStartPos(), t.getTargetEndPos(), t.getAnnotation());
                }
            }

            // The document was found - no need to check further leaves
            break;
        }
    }
    catch (IOException e) {
        match.addError(600, "Unable to read index", e.getLocalizedMessage());
        log.warn(e.getLocalizedMessage());
    }

    return match;
}
use of de.ids_mannheim.korap.util.QueryException in project Krill by KorAP.
the class KrillQuery method _groupFromJson.
// Deserialize koral:group
//
// Validates that the group carries an operation and an operand array,
// gives reference resolution the first chance to produce a wrapper and
// otherwise dispatches on the operation name.
// Throws 703 (no operation), 704 (no operand list), 717 (relation
// without relation node) or 711 (unknown operation).
private SpanQueryWrapper _groupFromJson(JsonNode json) throws QueryException {

    // A group is meaningless without an operation
    if (!json.has("operation")) {
        throw new QueryException(703, "Group expects operation");
    }

    final String op = json.get("operation").asText();

    if (DEBUG) {
        log.trace("Found {} group", op);
    }

    // The operand list has to exist and has to be an array
    final JsonNode operandList = json.get("operands");
    if (operandList == null || !operandList.isArray()) {
        throw new QueryException(704, "Operation needs operand list");
    }

    if (DEBUG) {
        log.trace("Operands are {}", operandList);
    }

    // A non-null reference wrapper takes precedence over the operation
    final SpanQueryWrapper referenceWrapper = _operationReferenceFromJSON(json, operandList);
    if (referenceWrapper != null) {
        return referenceWrapper;
    }

    // Branch on operation
    switch (op) {

        // "or" and "disjunction" are treated the same as "junction"
        case "operation:junction":
        case "operation:or":
        case "operation:disjunction":
            return this._operationJunctionFromJson(operandList);

        case "operation:position":
            return this._operationPositionFromJson(json, operandList);

        case "operation:sequence":
            return this._operationSequenceFromJson(json, operandList);

        case "operation:class":
            return this._operationClassFromJson(json, operandList);

        case "operation:repetition":
            return this._operationRepetitionFromJson(json, operandList);

        case "operation:relation":
            if (json.has("relType")) {
                return _operationRelationFromJson(operandList, json.get("relType"));
            }
            // EM: legacy
            if (json.has("relation")) {
                return _operationRelationFromJson(operandList, json.get("relation"));
            }
            throw new QueryException(717, "Missing relation node");

        // Gracefully warn on merge support
        case "operation:merge":
            this.addWarning(774, "Merge operation is currently not supported");
            return _fromKoral(operandList.get(0));

        // Unknown
        default:
            throw new QueryException(711, "Unknown group operation");
    }
}
use of de.ids_mannheim.korap.util.QueryException in project Krill by KorAP.
the class KrillQuery method _operationPositionFromJson.
// Deserialize operation:position
//
// Expects exactly two operands (otherwise 705) and reads the frame
// either from the first entry of the "frames" array or from the
// deprecated "frame" attribute. Without any frame information the
// frame defaults to "isAround".
// Throws 706 for unknown or malformed frames and 760 if exclusion is
// requested.
private SpanQueryWrapper _operationPositionFromJson(JsonNode json, JsonNode operands) throws QueryException {
    if (operands.size() != 2) {
        throw new QueryException(705, "Number of operands is not acceptable");
    }

    String frame = "isAround";

    // Temporary workaround for wrongly set overlaps
    if (json.has("frames")) {
        JsonNode frameN = json.get("frames");
        if (frameN.isArray()) {
            frameN = frameN.get(0);
            if (frameN != null && frameN.isValueNode()) {
                // Cut off the first 7 characters (as in "frames:").
                // Values that are too short become the empty frame,
                // which is rejected below as unknown (706) instead of
                // raising a StringIndexOutOfBoundsException.
                String frameText = frameN.asText();
                frame = frameText.length() > 7 ? frameText.substring(7) : "";
            }
        }
    }
    // <legacyCode>
    else if (json.has("frame")) {
        this.addMessage(0, "Frame is deprecated");
        JsonNode frameN = json.get("frame");
        if (frameN != null && frameN.isValueNode()) {
            // Cut off the first 6 characters (as in "frame:"),
            // guarding against too short values as above
            String frameText = frameN.asText();
            frame = frameText.length() > 6 ? frameText.substring(6) : "";
        }
    }
    // </legacyCode>

    if (DEBUG) {
        log.trace("Position frame is '{}'", frame);
    }

    // Byte flag - should cover all 13 cases, i.e. two bytes long
    byte flag = WITHIN;

    switch (frame) {
        case "isAround":
            JsonNode operand = operands.get(0);
            // An operand without @type is left to the operand
            // deserialization below instead of failing here with a
            // NullPointerException
            if (operand.has("@type") && operand.get("@type").asText().equals("koral:token")) {
                throw new QueryException(StatusCodes.INVALID_QUERY, "Token cannot contain another token or element.");
            }
            break;
        case "strictlyContains":
            flag = REAL_WITHIN;
            break;
        case "isWithin":
            break;
        case "startsWith":
            flag = STARTSWITH;
            break;
        case "endsWith":
            flag = ENDSWITH;
            break;
        case "matches":
            flag = MATCH;
            break;
        case "overlaps":
            flag = OVERLAP;
            this.addWarning(769, "Overlap variant currently interpreted as overlap");
            break;
        case "overlapsLeft":
            // Temporary workaround
            this.addWarning(769, "Overlap variant currently interpreted as overlap");
            flag = OVERLAP;
            break;
        case "overlapsRight":
            // Temporary workaround
            this.addWarning(769, "Overlap variant currently interpreted as overlap");
            flag = OVERLAP;
            break;
        case "strictlyOverlaps":
            flag = REAL_OVERLAP;
            break;
        default:
            throw new QueryException(706, "Frame type is unknown");
    }

    // <legacyCode>
    if (json.has("exclude") && json.get("exclude").asBoolean()) {
        throw new QueryException(760, "Exclusion is currently not supported in position operations");
    }
    // </legacyCode>

    // Create SpanWithin Query
    return new SpanWithinQueryWrapper(this._fromKoral(operands.get(0)), this._fromKoral(operands.get(1)), flag);
}
use of de.ids_mannheim.korap.util.QueryException in project Krill by KorAP.
the class KrillQuery method _termFromJson.
// Deserialize koral:term
// TODO: Not optimal as it does not respect non-term
//
// Builds the index term string from the optional direction prefix,
// foundry, layer, key and value of the node and wraps it
// - as a regex, wildcard or segment query, if the node is a
//   koral:term (optionally negated by "match:ne"), or
// - as a tag (element) query otherwise, optionally combined with the
//   attributes given in "attr".
//
// isSpan forces the node to be treated as an element, even if its
// @type is koral:term. direction may be null.
//
// Throws 701 (missing @type), 740 (neither key nor attr given) or
// 741 (unknown match relation).
private SpanQueryWrapper _termFromJson(JsonNode json, boolean isSpan, RelationDirection direction) throws QueryException {
    if (!json.has("@type")) {
        throw new QueryException(701, "JSON-LD group has no @type attribute");
    }

    String termType = json.get("@type").asText();

    // Empty koral:span hack - spans are never treated as terms
    boolean isTerm = !isSpan && termType.equals("koral:term");

    boolean hasKey = json.has("key") && json.get("key").asText().length() > 0;

    // why must it have an attr?
    if (!hasKey && !json.has("attr")) {
        throw new QueryException(740, "Key definition is missing in term or span");
    }

    boolean isCaseInsensitive = false;

    // <legacy>
    if (json.has("caseInsensitive") && json.get("caseInsensitive").asBoolean()) {
        isCaseInsensitive = true;
    }
    // </legacy>
    // Flags
    else if (json.has("flags") && json.get("flags").isArray()) {
        Iterator<JsonNode> flags = json.get("flags").elements();
        while (flags.hasNext()) {
            String flag = flags.next().asText();
            if (flag.equals("flags:caseInsensitive")) {
                isCaseInsensitive = true;
            }
            else {
                this.addWarning(748, "Flag is unknown", flag);
            }
        }
    }

    StringBuilder value = new StringBuilder();

    if (direction != null) {
        value.append(direction.value());
    }

    if (json.has("foundry") && json.get("foundry").asText().length() > 0) {
        value.append(json.get("foundry").asText()).append('/');
    }

    // No default foundry defined
    if (json.has("layer") && json.get("layer").asText().length() > 0) {
        String layer = json.get("layer").asText();

        // Map the verbose layer names to their short forms
        switch (layer) {
            case "lemma":
                layer = "l";
                break;
            case "pos":
                layer = "p";
                break;
            case "orth":
                // TODO: THIS IS AN UGLY HACK! AND SHOULD BE NAMED "SURFACE" or . OR *
                layer = ".";
                break;
            case "struct":
                layer = "s";
                break;
            case "const":
                layer = "c";
                break;
        }

        if (isCaseInsensitive && isTerm) {
            if (layer.equals(".")) {
                layer = "i";
            }
            else {
                this.addWarning(767, "Case insensitivity is currently not supported for this layer");
            }
        }

        // Ignore foundry for orth layer - this also drops a
        // direction prefix appended before
        if (layer.equals(".")) {
            layer = "s";
            value.setLength(0);
        }
        else if (layer.equals("i")) {
            value.setLength(0);
        }

        value.append(layer).append(':');
    }

    if (hasKey) {
        String key = json.get("key").asText();
        // Lowercase independently of the default locale of the JVM,
        // so the term does not change with the environment
        // (e.g. "I" would become a dotless i in a Turkish locale)
        value.append(isCaseInsensitive ? key.toLowerCase(java.util.Locale.ROOT) : key);
    }

    if (json.has("value") && json.get("value").asText().length() > 0) {
        value.append(':').append(json.get("value").asText());
    }

    // Regular expression or wildcard
    if (isTerm) {

        String match = "match:eq";
        if (json.has("match")) {
            match = json.get("match").asText();
        }

        if (json.has("type")) {
            QueryBuilder qb = this.builder();

            // Branch on type
            switch (json.get("type").asText()) {
                case "type:regex": {

                    // The regex can be rewritten to an any token
                    if (value.toString().matches("^[si]:\\.[\\+\\*]\\??$")) {
                        return new SpanRepetitionQueryWrapper();
                    }

                    SpanRegexQueryWrapper srqw = qb.re(value.toString(), isCaseInsensitive);

                    if (match.equals("match:ne")) {
                        if (DEBUG) {
                            log.trace("Term is negated");
                        }
                        return this.builder().seg().without(srqw);
                    }
                    else if (match.equals("match:eq")) {
                        return srqw;
                    }
                    throw new QueryException(741, "Match relation unknown");
                }
                case "type:wildcard": {

                    SpanWildcardQueryWrapper swcqw = qb.wc(value.toString(), isCaseInsensitive);

                    if (match.equals("match:ne")) {
                        if (DEBUG) {
                            log.trace("Term is negated");
                        }
                        return this.builder().seg().without(swcqw);
                    }
                    else if (match.equals("match:eq")) {
                        return swcqw;
                    }
                    throw new QueryException(741, "Match relation unknown");
                }
                case "type:string":
                    break;
                default:
                    this.addWarning(746, "Term type is not supported - treated as a string");
            }
        }

        // Plain string term
        SpanSegmentQueryWrapper ssqw = this.builder().seg(value.toString());

        if (match.equals("match:ne")) {
            if (DEBUG) {
                log.trace("Term is negated");
            }
            ssqw.makeNegative();
            return this.builder().seg().without(ssqw);
        }
        else if (match.equals("match:eq")) {
            return ssqw;
        }
        else {
            throw new QueryException(741, "Match relation unknown");
        }
    }

    // Element, optionally with attributes
    if (json.has("attr")) {
        JsonNode attrNode = json.get("attr");

        if (!attrNode.has("@type")) {
            throw new QueryException(701, "JSON-LD group has no @type attribute");
        }

        // Attributes on an arbitrary (unnamed) element
        if (value.toString().isEmpty()) {
            return _createElementAttrFromJson(null, json, attrNode);
        }

        SpanQueryWrapper elementWithIdWrapper = this.builder().tag(value.toString());
        if (elementWithIdWrapper == null) {
            return null;
        }
        return _createElementAttrFromJson(elementWithIdWrapper, json, attrNode);
    }

    return this.builder().tag(value.toString());
}
use of de.ids_mannheim.korap.util.QueryException in project Krill by KorAP.
the class KrillQuery method _resolveReference.
// Rebuild the operand list of the given node: All operands but the
// one at refOperandNum are copied, and the operand at refOperandNum
// is replaced by the first referent that _extractReferentClass
// returns for classNum from one of the other operands.
// The node is modified in place and returned.
// Throws a QueryException if no operand yields a referent.
private JsonNode _resolveReference(JsonNode node, JsonNode operands, int refOperandNum, int classNum) throws QueryException {
    ObjectMapper mapper = new ObjectMapper();
    ArrayNode rebuilt = mapper.createArrayNode();

    // Stays null until one of the operands yields the referent
    JsonNode target = null;

    for (int idx = 0; idx < operands.size(); idx++) {

        // The reference operand itself is skipped
        if (idx == refOperandNum) {
            continue;
        }

        JsonNode operand = operands.get(idx);

        // Only search as long as no referent was found
        if (target == null) {
            target = _extractReferentClass(operand, classNum);
        }

        rebuilt.insert(idx, operand);
    }

    if (target == null) {
        throw new QueryException("Referent node is not found");
    }

    // Put the referent in the place of the reference operand
    rebuilt.insert(refOperandNum, target);
    ((ObjectNode) node).set("operands", rebuilt);
    return node;
}
Aggregations