use of edu.illinois.cs.cogcomp.thrift.curator.Record in project cogcomp-nlp by CogComp.
the class CuratorClient method addRecordViewFromCurator.
/**
 * Does the network call to the Curator and fetches a record that has a particular view.
 * <p>
 * A new framed Thrift socket transport is opened for every call and is always closed again,
 * including when opening the transport or the remote call fails.
 *
 * @param text The raw text (sent to the Curator when {@link #respectTokenization} is false).
 * @param sentences The list of tokenized sentences (sent to the Curator when
 *        {@link #respectTokenization} is true; not used otherwise).
 * @param viewName The view to get; it is passed through {@code convertCuratorViewName} before
 *        being sent to the Curator.
 * @return A {@link edu.illinois.cs.cogcomp.thrift.curator.Record} with the requested view
 * @throws SocketException if address reuse cannot be enabled on the underlying socket
 * @throws TException if the Thrift transport or the remote call fails
 * @throws ServiceUnavailableException as declared by the Curator client calls
 * @throws AnnotationFailedException as declared by the Curator client calls
 */
private Record addRecordViewFromCurator(String text, List<String> sentences, String viewName) throws ServiceUnavailableException, AnnotationFailedException, TException, SocketException {
    viewName = convertCuratorViewName(viewName);
    TSocket socket = new TSocket(this.curatorHost, this.curatorPort);
    logger.debug("Calling curator on host '" + curatorHost + "', port '" + curatorPort + "' for view '" + viewName + "'...");
    try {
        socket.getSocket().setReuseAddress(true);
    } catch (SocketException e) {
        logger.error("Unable to setReuseAddress!", e);
        // Release the socket before propagating; nothing else holds a reference to it.
        socket.close();
        throw e;
    }
    TTransport transport = new TFramedTransport(socket);
    try {
        transport.open();
        Curator.Client client = new Curator.Client(new TBinaryProtocol(transport));
        if (respectTokenization) {
            return client.wsprovide(viewName, sentences, forceUpdate);
        }
        return client.provide(viewName, text, forceUpdate);
    } finally {
        // Close on every path so a failed open or remote call does not leak the connection.
        transport.close();
    }
}
use of edu.illinois.cs.cogcomp.thrift.curator.Record in project cogcomp-nlp by CogComp.
the class CuratorClient method getTextAnnotation.
/**
 * Builds a {@link edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation}
 * for the given {@code text}, labelled with {@code corpusId} and {@code textId}.
 * <p>
 * The method first obtains a {@code Record} via {@link #getRecord(String)}, then looks up that
 * record's {@link edu.illinois.cs.cogcomp.core.datastructures.ViewNames#TOKENS} and
 * {@link edu.illinois.cs.cogcomp.core.datastructures.ViewNames#SENTENCE} label views and hands
 * everything to {@code CuratorDataStructureInterface.getTextAnnotationFromRecord}. Whether the
 * tokenization comes from the Curator or from the local whitespace/newline splitting is decided
 * inside {@link #getRecord(String)} based on {@link #respectTokenization}.
 * <p>
 * <b>Note:</b> only the token and sentence label views are read here; other views have to be
 * requested separately through this {@link CuratorClient}.
 *
 * @param corpusId Identifier for the corpus
 * @param textId Identifier for the text
 * @param text The raw text
 * @return the {@code TextAnnotation} built from the record's token and sentence label views
 */
public TextAnnotation getTextAnnotation(String corpusId, String textId, String text) throws ServiceUnavailableException, AnnotationFailedException, TException, SocketException {
    final Record curatorRecord = getRecord(text);
    final Map<String, Labeling> labelViews = curatorRecord.getLabelViews();
    final Labeling tokenView = labelViews.get(ViewNames.TOKENS);
    final Labeling sentenceView = labelViews.get(ViewNames.SENTENCE);
    return CuratorDataStructureInterface.getTextAnnotationFromRecord(
            corpusId, textId, curatorRecord, tokenView, sentenceView);
}
use of edu.illinois.cs.cogcomp.thrift.curator.Record in project cogcomp-nlp by CogComp.
the class CuratorClient method getRecord.
/**
 * Creates a fresh {@link edu.illinois.cs.cogcomp.thrift.curator.Record} for {@code text} and
 * populates its token and sentence label views.
 * <p>
 * The record starts with the raw text, empty label/parse/cluster/general view maps and an
 * identifier from {@code Identifier.getId(text, respectTokenization)}. Then:
 * <ul>
 * <li>if {@link #respectTokenization} is {@code false}, the
 * {@link edu.illinois.cs.cogcomp.core.datastructures.ViewNames#TOKENS} and
 * {@link edu.illinois.cs.cogcomp.core.datastructures.ViewNames#SENTENCE} views are fetched
 * through {@code addRecordView} (one Curator request each);</li>
 * <li>if it is {@code true}, the text is split on the platform line separator, empty lines are
 * dropped, and the remaining lines are passed to {@code RecordUtils.sentences} and
 * {@code RecordUtils.tokenize} to build the two views locally.</li>
 * </ul>
 * <b>Note:</b> the returned record carries no views other than sentence and token; other views
 * have to be requested separately through this {@link CuratorClient}.
 *
 * @param text The text (tokenized or not)
 * @return A {@link edu.illinois.cs.cogcomp.thrift.curator.Record} with
 *         {@link edu.illinois.cs.cogcomp.core.datastructures.ViewNames#TOKENS} and
 *         {@link edu.illinois.cs.cogcomp.core.datastructures.ViewNames#SENTENCE} views.
 */
private Record getRecord(String text) throws ServiceUnavailableException, AnnotationFailedException, TException, SocketException {
    // Bare record: raw text, empty view containers, curator-compatible identifier.
    Record fresh = new Record();
    fresh.setRawText(text);
    fresh.setLabelViews(new TreeMap<String, Labeling>());
    fresh.setParseViews(new TreeMap<String, Forest>());
    fresh.setClusterViews(new TreeMap<String, Clustering>());
    fresh.setViews(new TreeMap<String, View>());
    fresh.setIdentifier(Identifier.getId(text, respectTokenization));

    if (!respectTokenization) {
        // Let the Curator produce tokens and sentences.
        addRecordView(fresh, ViewNames.TOKENS);
        addRecordView(fresh, ViewNames.SENTENCE);
        return fresh;
    }

    // Caller-supplied tokenization: one sentence per non-empty line.
    List<String> nonEmptyLines = new LinkedList<>();
    for (String line : text.split(System.getProperty("line.separator"))) {
        if (!line.isEmpty()) {
            nonEmptyLines.add(line);
        }
    }
    Labeling sentenceView = RecordUtils.sentences(nonEmptyLines);
    fresh.getLabelViews().put(ViewNames.SENTENCE, sentenceView);
    Labeling tokenView = RecordUtils.tokenize(nonEmptyLines);
    fresh.getLabelViews().put(ViewNames.TOKENS, tokenView);
    return fresh;
}
use of edu.illinois.cs.cogcomp.thrift.curator.Record in project cogcomp-nlp by CogComp.
the class CuratorClient method getTextAnnotationView.
/**
 * Fetches the view {@code viewName} for {@code ta} from the Curator and converts it into a
 * {@link edu.illinois.cs.cogcomp.core.datastructures.textannotation.View} aligned to {@code ta}.
 * <p>
 * The Curator is queried with the text and sentence list of {@code ta}. The conversion applied
 * to the returned record depends on {@code ViewNames.getViewType(viewName)}: token-label and
 * span-label views are read from the record's label views, dependency, parse and
 * predicate-argument views from its parse views, and coreference views from its cluster views.
 * Overlapping spans are allowed only for the {@code ViewNames.WIKIFIER} span-label view.
 *
 * @param ta the text annotation the view is aligned to
 * @param viewName name of the view to fetch
 * @return the converted view
 * @throws AnnotationFailedException if a dependency or parse forest has more trees than
 *         {@code ta} has sentences, or if the view type is not one of the handled types
 */
public edu.illinois.cs.cogcomp.core.datastructures.textannotation.View getTextAnnotationView(TextAnnotation ta, String viewName) throws TException, AnnotationFailedException, ServiceUnavailableException, SocketException {
    Record fetched = addRecordViewFromCurator(ta.getText(), TextAnnotationUtilities.getSentenceList(ta), viewName);
    ViewTypes viewType = ViewNames.getViewType(viewName);

    if (viewType == ViewTypes.TOKEN_LABEL_VIEW) {
        Labeling tokenLabels = fetched.getLabelViews().get(convertCuratorViewName(viewName));
        return CuratorDataStructureInterface.alignLabelingToTokenLabelView(viewName, ta, tokenLabels);
    }
    if (viewType == ViewTypes.SPAN_LABEL_VIEW) {
        // Only the Wikifier view may contain overlapping spans.
        boolean overlapPermitted = viewName.equals(ViewNames.WIKIFIER);
        Labeling spanLabels = fetched.getLabelViews().get(convertCuratorViewName(viewName));
        return CuratorDataStructureInterface.alignLabelingToSpanLabelView(viewName, ta, spanLabels, overlapPermitted);
    }
    if (viewType == ViewTypes.DEPENDENCY_VIEW) {
        Forest dependencies = fetched.getParseViews().get(convertCuratorViewName(viewName));
        if (dependencies.trees.size() > TextAnnotationUtilities.getSentenceList(ta).size()) {
            throw new AnnotationFailedException("mismatched number of trees and sentences.");
        }
        return CuratorDataStructureInterface.alignForestToDependencyView(viewName, ta, dependencies);
    }
    if (viewType == ViewTypes.PARSE_VIEW) {
        Forest parses = fetched.getParseViews().get(convertCuratorViewName(viewName));
        if (parses.trees.size() > TextAnnotationUtilities.getSentenceList(ta).size()) {
            throw new AnnotationFailedException("mismatched number of trees and sentences.");
        }
        return CuratorDataStructureInterface.alignForestToParseTreeView(viewName, ta, parses);
    }
    if (viewType == ViewTypes.PREDICATE_ARGUMENT_VIEW) {
        Forest predicateArguments = fetched.getParseViews().get(convertCuratorViewName(viewName));
        return CuratorDataStructureInterface.alignForestToPredicateArgumentView(viewName, ta, predicateArguments);
    }
    if (viewType == ViewTypes.COREF_VIEW) {
        Clustering corefChains = fetched.getClusterViews().get(convertCuratorViewName(viewName));
        return CuratorDataStructureInterface.alignClusteringToCoreferenceView(viewName, ta, corefChains);
    }
    throw new AnnotationFailedException("Unrecognised view type " + viewType);
}
use of edu.illinois.cs.cogcomp.thrift.curator.Record in project cogcomp-nlp by CogComp.
the class CuratorClient method addRecordView.
/**
 * Fetches the view {@code viewName} from the Curator and stores it in {@code record}.
 * <p>
 * The Curator is queried with the record's raw text and sentence list. The fetched view is
 * looked up under the converted (Curator) view name and stored in {@code record} under
 * {@code viewName}: token-label and span-label views go into the label views, coreference views
 * into the cluster views, and dependency, parse and predicate-argument views into the parse
 * views. For any other view type the record is left unchanged.
 *
 * @param record The {@link edu.illinois.cs.cogcomp.thrift.curator.Record} to annotate
 * @param viewName The view to add
 */
private void addRecordView(Record record, String viewName) throws TException, AnnotationFailedException, ServiceUnavailableException, SocketException {
    Record fetched = addRecordViewFromCurator(record.getRawText(), RecordUtils.getSentenceList(record), viewName);
    ViewTypes viewType = ViewNames.getViewType(viewName);

    boolean isLabelView = viewType == ViewTypes.TOKEN_LABEL_VIEW
            || viewType == ViewTypes.SPAN_LABEL_VIEW;
    boolean isClusterView = viewType == ViewTypes.COREF_VIEW;
    boolean isParseView = viewType == ViewTypes.DEPENDENCY_VIEW
            || viewType == ViewTypes.PARSE_VIEW
            || viewType == ViewTypes.PREDICATE_ARGUMENT_VIEW;

    if (isLabelView) {
        Map<String, Labeling> fetchedLabels = fetched.getLabelViews();
        record.labelViews.put(viewName, fetchedLabels.get(convertCuratorViewName(viewName)));
    } else if (isClusterView) {
        Map<String, Clustering> fetchedClusters = fetched.getClusterViews();
        record.clusterViews.put(viewName, fetchedClusters.get(convertCuratorViewName(viewName)));
    } else if (isParseView) {
        Map<String, Forest> fetchedForests = fetched.getParseViews();
        record.parseViews.put(viewName, fetchedForests.get(convertCuratorViewName(viewName)));
    }
    // Any other view type: nothing is copied.
}
Aggregations