use of org.tribuo.data.text.impl.BasicPipeline in project tribuo by oracle.
the class TestXGBoost method loadDataset.
/**
 * Reads the text data at {@code path} and wraps it in an immutable dataset that
 * reuses the feature ID map and output ID info taken from {@code model}.
 *
 * <p>Text is turned into features by a {@link BasicPipeline} built on a
 * {@link BreakIteratorTokenizer} for {@link Locale#US}; the pipeline's second
 * argument is 2 (presumably the n-gram size - confirm against BasicPipeline).
 *
 * @param model the model supplying the feature ID map and output ID info.
 * @param path  the location of the text data to load.
 * @return an immutable dataset built from the data source and the model's domains.
 * @throws IOException declared by this method; presumably raised when the data
 *                     source cannot read {@code path}.
 */
private Dataset<Label> loadDataset(XGBoostModel<Label> model, Path path) throws IOException {
    BasicPipeline textPipeline = new BasicPipeline(new BreakIteratorTokenizer(Locale.US), 2);
    TextFeatureExtractor<Label> featureExtractor = new TextFeatureExtractorImpl<>(textPipeline);
    LabelFactory labelFactory = new LabelFactory();
    TextDataSource<Label> source = new SimpleTextDataSource<>(path, labelFactory, featureExtractor);
    return new ImmutableDataset<>(source, model.getFeatureIDMap(), model.getOutputIDInfo(), false);
}
use of org.tribuo.data.text.impl.BasicPipeline in project tribuo by oracle.
the class JsonDataSourceTest method buildRowProcessor.
/**
 * Assembles the {@link RowProcessor} used by this test.
 *
 * <p>The processor is configured with:
 * <ul>
 *   <li>field processors for "height" (double), "description" (text, via a
 *       {@link BasicPipeline} over a US-locale {@link BreakIteratorTokenizer})
 *       and "transport" (identity);</li>
 *   <li>a second processor map holding a double processor named "regex" under
 *       the key "extra.*" (presumably a field-name pattern - confirm against
 *       RowProcessor);</li>
 *   <li>a response processor reading "disposition" with "UNK" as its second argument;</li>
 *   <li>metadata extractors for "id" and "timestamp".</li>
 * </ul>
 *
 * @return the configured row processor.
 */
private static RowProcessor<MockOutput> buildRowProcessor() {
    // Processors registered under an explicit field name.
    Map<String, FieldProcessor> namedProcessors = new HashMap<>();
    namedProcessors.put("height", new DoubleFieldProcessor("height"));
    BasicPipeline descriptionPipeline = new BasicPipeline(new BreakIteratorTokenizer(Locale.US), 2);
    namedProcessors.put("description", new TextFieldProcessor("description", descriptionPipeline));
    namedProcessors.put("transport", new IdentityProcessor("transport"));

    // Processors registered under the "extra.*" key.
    Map<String, FieldProcessor> patternProcessors = new HashMap<>();
    patternProcessors.put("extra.*", new DoubleFieldProcessor("regex"));

    ResponseProcessor<MockOutput> outputProcessor =
            new FieldResponseProcessor<>("disposition", "UNK", new MockOutputFactory());

    List<FieldExtractor<?>> metadata = new ArrayList<>();
    metadata.add(new IntExtractor("id"));
    metadata.add(new DateExtractor("timestamp", "timestamp", "dd/MM/yyyy HH:mm"));

    return new RowProcessor<>(
            metadata,
            null,
            outputProcessor,
            namedProcessors,
            patternProcessors,
            Collections.emptySet());
}
use of org.tribuo.data.text.impl.BasicPipeline in project tribuo by oracle.
the class LIMEColumnarTest method generateBinarisedDataset.
/**
 * Loads the columnar test CSV into a mutable dataset using a row processor in
 * which fields "A" and "D" use identity processors, "B" and "C" use double
 * processors, and "TextField" uses a text processor backed by a
 * {@link BasicPipeline} over this class's {@code tokenizer}.
 *
 * @return a pair of the row processor and the dataset it produced.
 * @throws URISyntaxException if the CSV resource URL cannot be converted to a URI.
 */
private Pair<RowProcessor<Label>, Dataset<Label>> generateBinarisedDataset() throws URISyntaxException {
    String csvResource = "/org/tribuo/classification/explanations/lime/test-columnar.csv";

    ResponseProcessor<Label> labelProcessor = new FieldResponseProcessor<>("Response", "N", new LabelFactory());

    Map<String, FieldProcessor> processors = new HashMap<>();
    processors.put("A", new IdentityProcessor("A"));
    processors.put("B", new DoubleFieldProcessor("B"));
    processors.put("C", new DoubleFieldProcessor("C"));
    processors.put("D", new IdentityProcessor("D"));
    processors.put("TextField", new TextFieldProcessor("TextField", new BasicPipeline(tokenizer, 2)));

    RowProcessor<Label> rowProcessor = new RowProcessor<>(labelProcessor, processors);
    CSVDataSource<Label> csvSource =
            new CSVDataSource<>(LIMEColumnarTest.class.getResource(csvResource).toURI(), rowProcessor, true);
    return new Pair<>(rowProcessor, new MutableDataset<>(csvSource));
}
use of org.tribuo.data.text.impl.BasicPipeline in project tribuo by oracle.
the class LIMEColumnarTest method generateCategoricalDataset.
/**
 * Loads the columnar test CSV into a mutable dataset, like the binarised
 * variant, except that the identity processors for fields "A" and "D" are
 * anonymous subclasses whose {@code getFeatureType()} reports
 * {@link GeneratedFeatureType#CATEGORICAL}.
 *
 * <p>Fields "B" and "C" use double processors and "TextField" uses a text
 * processor backed by a {@link BasicPipeline} over this class's {@code tokenizer}.
 *
 * @return a pair of the row processor and the dataset it produced.
 * @throws URISyntaxException if the CSV resource URL cannot be converted to a URI.
 */
private Pair<RowProcessor<Label>, Dataset<Label>> generateCategoricalDataset() throws URISyntaxException {
    String csvResource = "/org/tribuo/classification/explanations/lime/test-columnar.csv";

    ResponseProcessor<Label> labelProcessor = new FieldResponseProcessor<>("Response", "N", new LabelFactory());

    Map<String, FieldProcessor> processors = new HashMap<>();

    // Identity processor for "A", overridden to report a categorical feature type.
    FieldProcessor categoricalA = new IdentityProcessor("A") {
        @Override
        public GeneratedFeatureType getFeatureType() {
            return GeneratedFeatureType.CATEGORICAL;
        }
    };
    processors.put("A", categoricalA);
    processors.put("B", new DoubleFieldProcessor("B"));
    processors.put("C", new DoubleFieldProcessor("C"));

    // Identity processor for "D", overridden to report a categorical feature type.
    FieldProcessor categoricalD = new IdentityProcessor("D") {
        @Override
        public GeneratedFeatureType getFeatureType() {
            return GeneratedFeatureType.CATEGORICAL;
        }
    };
    processors.put("D", categoricalD);
    processors.put("TextField", new TextFieldProcessor("TextField", new BasicPipeline(tokenizer, 2)));

    RowProcessor<Label> rowProcessor = new RowProcessor<>(labelProcessor, processors);
    CSVDataSource<Label> csvSource =
            new CSVDataSource<>(LIMEColumnarTest.class.getResource(csvResource).toURI(), rowProcessor, true);
    return new Pair<>(rowProcessor, new MutableDataset<>(csvSource));
}
use of org.tribuo.data.text.impl.BasicPipeline in project tribuo by oracle.
the class TextPipelineTest method testBasicPipelineTagging.
/**
 * Checks that a {@link BasicPipeline} built with a US-locale
 * {@link BreakIteratorTokenizer} and an argument of 2 emits, for the tag
 * "Monkeys", every expected "1-N" and "2-N" feature with value 1.0.
 */
@Test
public void testBasicPipelineTagging() {
    BasicPipeline pipeline = new BasicPipeline(new BreakIteratorTokenizer(Locale.US), 2);
    List<Feature> features = pipeline.process("Monkeys", "This is some input text.");

    // Expected feature names, checked in order: single tokens first, then adjacent pairs.
    String[] expectedNames = {
        "Monkeys-1-N=This",
        "Monkeys-1-N=is",
        "Monkeys-1-N=some",
        "Monkeys-1-N=input",
        "Monkeys-1-N=text",
        "Monkeys-2-N=This/is",
        "Monkeys-2-N=is/some",
        "Monkeys-2-N=some/input",
        "Monkeys-2-N=input/text",
    };
    for (String expectedName : expectedNames) {
        assertTrue(features.contains(new Feature(expectedName, 1.0)));
    }
}
Aggregations