use of de.ids_mannheim.korap.util.CorpusDataException in project Krill by KorAP.
the class Test method getTermVector.
/**
 * Builds a {@link MultiTermTokenStream} from a compact test notation.
 *
 * The input is split at single spaces into segments, and every segment is
 * split at {@code |} into fields. For each segment one token is created:
 * <ul>
 * <li>field 0 is added with prefix {@code 's'} and, lowercased, with
 * prefix {@code 'i'};</li>
 * <li>the bare term {@code "T"} is added;</li>
 * <li>field 1 is added with prefix {@code 'p'}, field 2 with prefix
 * {@code 'l'};</li>
 * <li>if there are exactly four fields, field 3 is split at {@code ;} and
 * every part is added with prefix {@code 'm'};</li>
 * <li>if there are exactly five fields, field 4 is added with prefix
 * {@code 'e'}.</li>
 * </ul>
 *
 * A segment with fewer than three fields raises an
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param stream
 *            space-separated segments of {@code |}-separated fields
 * @return the token stream holding one token per segment
 */
public static MultiTermTokenStream getTermVector(String stream) {
    MultiTermTokenStream ts = new MultiTermTokenStream();

    for (String seg : stream.split(" ")) {
        String[] tokseg = seg.split("\\|");

        try {
            MultiTermToken mtt = new MultiTermToken('s', tokseg[0]);
            mtt.add("T");

            // Locale.ROOT keeps the lowercased term identical on every
            // machine; the default-locale variant maps 'I' to a dotless
            // 'i' under e.g. a Turkish default locale.
            mtt.add('i', tokseg[0].toLowerCase(java.util.Locale.ROOT));
            mtt.add('p', tokseg[1]);
            mtt.add('l', tokseg[2]);

            if (tokseg.length == 4) {
                for (String morph : tokseg[3].split(";")) {
                    mtt.add('m', morph);
                }
            }

            // NOTE(review): with five fields, field 3 is not added at all -
            // confirm this is intended before relying on 'm' terms here.
            if (tokseg.length == 5) {
                mtt.add('e', tokseg[4]);
            }

            ts.addMultiTermToken(mtt);
        } catch (CorpusDataException cde) {
            // Report malformed term data through fail(...) with code and message
            fail(cde.getErrorCode() + ": " + cde.getMessage());
        }
    }

    return ts;
}
use of de.ids_mannheim.korap.util.CorpusDataException in project Krill by KorAP.
the class MultiTerm method _fromString.
/*
 * Deserialize MultiTerm from string representation.
 *
 * The string is split at the first '$' that is not preceded by a
 * backslash into a surface part and an optional payload. The surface part
 * is split at the first '#' that is not preceded by a backslash into the
 * term and an optional offset of the form "start-end".
 *
 * A payload of the form "<X>..." is treated as typed: it is cut at the
 * type markers, and each marker "<b>", "<s>", "<i>" or "<l>" writes the
 * following value as byte, short, int or long into the byte buffer.
 * Values behind any other marker are skipped. Every other payload is
 * stored as a plain string.
 *
 * If parsing a typed payload fails, the failure is only logged (when
 * DEBUG is set) and the payload is left unchanged.
 *
 * Throws CorpusDataException with code 952 if the offset is not numeric
 * and with code 953 if the offset is incomplete.
 */
private void _fromString(String term) throws CorpusDataException {
    String[] termSurface = term.split("(?<!\\\\)\\$", 2);

    // Payload is given
    if (termSurface.length == 2) {
        String payloadStr = termSurface[1];

        // Payload has a type. The length guard is required because the
        // payload may be empty (term ends with '$') or shorter than a
        // type marker; without it charAt() throws an unchecked
        // StringIndexOutOfBoundsException. Such payloads are stored as
        // plain strings below.
        if (payloadStr.length() >= 3
                && payloadStr.charAt(0) == '<'
                && payloadStr.charAt(2) == '>') {

            // Rewind bytebuffer
            bb.rewind();

            // Split payload at type marker boundaries, so markers sit at
            // even and their values at odd indices
            String[] pls = payloadStr.split("((?=<)|(?<=>))(?!\\A)");

            // Bytearray length
            l = 0;

            try {
                for (i = 1; i < pls.length; i += 2) {

                    // Resize the bytebuffer so the widest value (8 bytes)
                    // always fits
                    if ((bb.capacity() - l) < 8) {
                        bb = ByteBuffer.allocate(bb.capacity() + 8).put(bb.array());
                        bb.position(l);
                    }

                    switch (pls[i - 1]) {
                        // byte
                        case "<b>":
                            bb.put(Byte.parseByte(pls[i]));
                            l++;
                            break;
                        // short
                        case "<s>":
                            bb.putShort(Short.parseShort(pls[i]));
                            l += 2;
                            break;
                        // integer
                        case "<i>":
                            bb.putInt(Integer.parseInt(pls[i]));
                            l += 4;
                            break;
                        // long
                        case "<l>":
                            bb.putLong(Long.parseLong(pls[i]));
                            l += 8;
                            break;
                    }
                }

                // Copy only the bytes actually written
                byte[] bytes = new byte[l];
                System.arraycopy(bb.array(), 0, bytes, 0, l);
                this.payload = new BytesRef(bytes);
            } catch (Exception e) {
                // Best effort: a broken typed payload does not abort
                // deserialization of the term itself
                if (DEBUG)
                    log.warn(e.getMessage());
            }
        } else {
            // Payload is a string
            this.payload = new BytesRef(payloadStr);
        }
    }

    // Parse offset information
    stringOffset = termSurface[0].split("(?<!\\\\)\\#", 2);

    if (stringOffset.length == 2) {
        // Split start and end position of the offset
        String[] offset = stringOffset[1].split("\\-", 2);

        // Start and end is given
        if (offset.length == 2 && offset[0].length() > 0) {
            try {
                this.start = Integer.parseInt(offset[0]);
                this.end = Integer.parseInt(offset[1]);
            } catch (NumberFormatException e) {
                throw new CorpusDataException(952, "Given offset information is not numeric in " + termSurface[0]);
            }
        } else {
            throw new CorpusDataException(953, "Given offset information is incomplete in " + termSurface[0]);
        }
    }

    this.term = _unescape(stringOffset[0]);
}
use of de.ids_mannheim.korap.util.CorpusDataException in project Krill by KorAP.
the class FieldDocument method setFields.
/**
 * Deserialize token stream data (LEGACY).
 *
 * The first entry of {@code fields} supplies the primary data under the
 * key {@code "primaryData"}. Every following entry describes one field:
 * {@code "name"} is the field name and {@code "data"} a list of tokens,
 * each token being a list of term strings. Optional {@code "foundries"}
 * and {@code "tokenization"} values are stored as well.
 *
 * Terms that raise a {@link CorpusDataException} are recorded through
 * {@code addError} and the affected token is not added to the stream.
 *
 * The passed structure is only read, never modified.
 *
 * @param fields
 *            primary data entry followed by the field entries; must
 *            contain at least one entry
 */
public void setFields(ArrayList<Map<String, Object>> fields) {
    // Read the primary data entry without removing it, so the caller's
    // list stays intact and can be processed again.
    Map<String, Object> primary = fields.get(0);
    this.setPrimaryData((String) primary.get("primaryData"));

    for (Map<String, Object> field : fields.subList(1, fields.size())) {
        String fieldName = (String) field.get("name");
        MultiTermTokenStream mtts = this.newMultiTermTokenStream();

        for (ArrayList<String> token : (ArrayList<ArrayList<String>>) field.get("data")) {
            try {
                // The first term initializes the token, the rest is added
                MultiTermToken mtt = new MultiTermToken(token.get(0));
                for (String term : token.subList(1, token.size())) {
                    mtt.add(term);
                }
                mtts.addMultiTermToken(mtt);
            } catch (CorpusDataException cde) {
                this.addError(cde.getErrorCode(), cde.getMessage());
            }
        }

        // Foundry and tokenization information is stored per document
        // in addition to the term vector.
        if (field.containsKey("foundries")) {
            // TODO: Do not store positions!
            String foundries = (String) field.get("foundries");
            this.addKeyword("foundries", foundries);
            super.setFoundries(foundries);
        }

        if (field.containsKey("tokenization")) {
            String tokenization = (String) field.get("tokenization");
            this.addString("tokenization", tokenization);
            super.setTokenization(tokenization);
        }

        this.addTV(fieldName, this.getPrimaryData(), mtts);
    }
}
use of de.ids_mannheim.korap.util.CorpusDataException in project Krill by KorAP.
the class FieldDocument method setData.
/**
 * Deserialize token stream data.
 *
 * Reads the primary data from {@code "text"}, the field name from
 * {@code "name"} and the tokens from {@code "stream"}, where each token
 * is a list of term strings. The optional values {@code "foundries"},
 * {@code "layerInfos"} and {@code "tokenSource"} are passed to their
 * setters when present.
 *
 * Terms that raise a {@link CorpusDataException} are recorded through
 * {@code addError} and the affected token is not added to the stream.
 *
 * The passed structure is only read, never modified.
 *
 * @param node
 *            the deserialized data node
 */
public void setData(Map<String, Object> node) {
    this.setPrimaryData((String) node.get("text"));
    String fieldName = (String) node.get("name");
    MultiTermTokenStream mtts = this.newMultiTermTokenStream();

    // Iterate over all tokens in stream
    for (ArrayList<String> token : (ArrayList<ArrayList<String>>) node.get("stream")) {
        try {
            // Initialize MultiTermToken with the first term; the list is
            // read in place so the caller's data is not altered.
            MultiTermToken mtt = new MultiTermToken(token.get(0));

            // Add rest of the list
            for (String term : token.subList(1, token.size())) {
                mtt.add(term);
            }

            // Add MultiTermToken to stream
            mtts.addMultiTermToken(mtt);
        } catch (CorpusDataException cde) {
            this.addError(cde.getErrorCode(), cde.getMessage());
        }
    }

    // Add tokenstream to fielddocument
    this.addTV(fieldName, this.getPrimaryData(), mtts);

    // Get foundry info
    if (node.containsKey("foundries")) {
        this.setFoundries((String) node.get("foundries"));
    }

    // Get layer info
    if (node.containsKey("layerInfos")) {
        this.setLayerInfos((String) node.get("layerInfos"));
    }

    // Get tokenSource info
    if (node.containsKey("tokenSource")) {
        this.setTokenSource((String) node.get("tokenSource"));
    }
}
Aggregations