Use of org.tribuo.data.columnar.processors.field.TextFieldProcessor in project tribuo by Oracle.
From the class JsonDataSourceTest, method buildRowProcessor.
/**
 * Assembles the {@link RowProcessor} fixture used by this test class.
 * <p>
 * The processor is wired with:
 * <ul>
 *   <li>two metadata extractors: an {@code IntExtractor} on "id" and a
 *       {@code DateExtractor} on "timestamp" using the pattern "dd/MM/yyyy HH:mm";</li>
 *   <li>a {@code FieldResponseProcessor} reading the "disposition" field;</li>
 *   <li>named field processors for "height", "description" and "transport";</li>
 *   <li>a regex-keyed processor registered under the pattern "extra.*".</li>
 * </ul>
 *
 * @return a freshly constructed row processor producing {@code MockOutput} examples.
 */
private static RowProcessor<MockOutput> buildRowProcessor() {
    // Metadata extractors.
    List<FieldExtractor<?>> extractors = new ArrayList<>();
    extractors.add(new IntExtractor("id"));
    extractors.add(new DateExtractor("timestamp", "timestamp", "dd/MM/yyyy HH:mm"));

    // Response comes from the "disposition" field.
    // NOTE(review): "UNK" is presumably the fallback response value - confirm against FieldResponseProcessor.
    ResponseProcessor<MockOutput> outputProcessor =
            new FieldResponseProcessor<>("disposition", "UNK", new MockOutputFactory());

    // Processors bound to exact field names.
    // NOTE(review): the 2 passed to BasicPipeline is presumably the n-gram size - confirm.
    BasicPipeline descriptionPipeline = new BasicPipeline(new BreakIteratorTokenizer(Locale.US), 2);
    Map<String, FieldProcessor> namedProcessors = new HashMap<>();
    namedProcessors.put("height", new DoubleFieldProcessor("height"));
    namedProcessors.put("description", new TextFieldProcessor("description", descriptionPipeline));
    namedProcessors.put("transport", new IdentityProcessor("transport"));

    // Processors bound to field-name patterns.
    Map<String, FieldProcessor> patternProcessors = new HashMap<>();
    patternProcessors.put("extra.*", new DoubleFieldProcessor("regex"));

    // The second constructor argument is deliberately null, and the final set is empty.
    return new RowProcessor<>(
            extractors,
            null,
            outputProcessor,
            namedProcessors,
            patternProcessors,
            Collections.emptySet());
}
Use of org.tribuo.data.columnar.processors.field.TextFieldProcessor in project tribuo by Oracle.
From the class LIMEColumnarTest, method generateBinarisedDataset.
/**
 * Loads the columnar test CSV into a dataset, using plain (non-overridden)
 * {@code IdentityProcessor}s for fields "A" and "D".
 * <p>
 * Fields "B" and "C" go through {@code DoubleFieldProcessor}, and "TextField" goes
 * through a {@code TextFieldProcessor} backed by a {@code BasicPipeline} built from
 * this class's {@code tokenizer}. The response is read from the "Response" field.
 *
 * @return a pair of the row processor and the dataset it produced.
 * @throws URISyntaxException if the classpath resource URL cannot be converted to a URI.
 */
private Pair<RowProcessor<Label>, Dataset<Label>> generateBinarisedDataset() throws URISyntaxException {
    final String csvResource = "/org/tribuo/classification/explanations/lime/test-columnar.csv";

    // Per-field feature processors.
    Map<String, FieldProcessor> processors = new HashMap<>();
    processors.put("A", new IdentityProcessor("A"));
    processors.put("B", new DoubleFieldProcessor("B"));
    processors.put("C", new DoubleFieldProcessor("C"));
    processors.put("D", new IdentityProcessor("D"));
    processors.put("TextField", new TextFieldProcessor("TextField", new BasicPipeline(tokenizer, 2)));

    // Response extraction from the "Response" field.
    LabelFactory factory = new LabelFactory();
    ResponseProcessor<Label> labelProcessor = new FieldResponseProcessor<>("Response", "N", factory);

    RowProcessor<Label> rowProcessor = new RowProcessor<>(labelProcessor, processors);

    // Read the CSV from the test classpath and materialise it.
    CSVDataSource<Label> csvSource = new CSVDataSource<>(
            LIMEColumnarTest.class.getResource(csvResource).toURI(),
            rowProcessor,
            true);
    Dataset<Label> loaded = new MutableDataset<>(csvSource);

    return new Pair<>(rowProcessor, loaded);
}
Use of org.tribuo.data.columnar.processors.field.TextFieldProcessor in project tribuo by Oracle.
From the class LIMEColumnarTest, method generateCategoricalDataset.
/**
 * Loads the columnar test CSV into a dataset, with fields "A" and "D" handled by
 * {@code IdentityProcessor} subclasses whose {@code getFeatureType()} reports
 * {@code GeneratedFeatureType.CATEGORICAL}.
 * <p>
 * Fields "B" and "C" go through {@code DoubleFieldProcessor}, and "TextField" goes
 * through a {@code TextFieldProcessor} backed by a {@code BasicPipeline} built from
 * this class's {@code tokenizer}. The response is read from the "Response" field.
 *
 * @return a pair of the row processor and the dataset it produced.
 * @throws URISyntaxException if the classpath resource URL cannot be converted to a URI.
 */
private Pair<RowProcessor<Label>, Dataset<Label>> generateCategoricalDataset() throws URISyntaxException {
    final String csvResource = "/org/tribuo/classification/explanations/lime/test-columnar.csv";

    LabelFactory factory = new LabelFactory();
    ResponseProcessor<Label> labelProcessor = new FieldResponseProcessor<>("Response", "N", factory);

    Map<String, FieldProcessor> processors = new HashMap<>();

    // "A": identity processor forced to report a categorical feature type.
    FieldProcessor categoricalA = new IdentityProcessor("A") {
        @Override
        public GeneratedFeatureType getFeatureType() {
            return GeneratedFeatureType.CATEGORICAL;
        }
    };
    processors.put("A", categoricalA);

    processors.put("B", new DoubleFieldProcessor("B"));
    processors.put("C", new DoubleFieldProcessor("C"));

    // "D": same categorical override as "A".
    FieldProcessor categoricalD = new IdentityProcessor("D") {
        @Override
        public GeneratedFeatureType getFeatureType() {
            return GeneratedFeatureType.CATEGORICAL;
        }
    };
    processors.put("D", categoricalD);

    processors.put("TextField", new TextFieldProcessor("TextField", new BasicPipeline(tokenizer, 2)));

    RowProcessor<Label> rowProcessor = new RowProcessor<>(labelProcessor, processors);

    // Read the CSV from the test classpath and materialise it.
    CSVDataSource<Label> csvSource = new CSVDataSource<>(
            LIMEColumnarTest.class.getResource(csvResource).toURI(),
            rowProcessor,
            true);
    Dataset<Label> loaded = new MutableDataset<>(csvSource);

    return new Pair<>(rowProcessor, loaded);
}
Use of org.tribuo.data.columnar.processors.field.TextFieldProcessor in project tribuo by Oracle.
From the class RowProcessorTest, method replaceNewlinesWithSpacesTest.
/**
 * Checks that the final boolean passed to the {@link RowProcessor} constructor changes
 * the features generated from a text field containing a run of newlines.
 * <p>
 * The tokenizer is wrapped in a {@code MungingTokenizer} whose munging function rewrites
 * each blank-line run as {@code " *\n\n"}. With the flag {@code false} the "*" marker
 * shows up in the generated features (5 features); with the flag {@code true} it does
 * not (3 features).
 */
@Test
public void replaceNewlinesWithSpacesTest() {
    // Matches one or more consecutive "blank" lines (lines holding only whitespace or dashes).
    final Pattern blankLinePattern = Pattern.compile("(\n[\\s-]*\n)+");

    // Null or empty input passes through untouched; otherwise the pieces between
    // blank-line runs are re-joined with a " *" marker followed by two newlines.
    final Function<CharSequence, CharSequence> blankLineMarker = text ->
            (text == null || text.length() == 0)
                    ? text
                    : blankLinePattern.splitAsStream(text).collect(Collectors.joining(" *\n\n"));

    Tokenizer mungingTokenizer = new MungingTokenizer(new BreakIteratorTokenizer(Locale.US), blankLineMarker);
    TokenPipeline pipeline = new TokenPipeline(mungingTokenizer, 2, false);

    final Map<String, FieldProcessor> processors = new HashMap<>();
    processors.put("order_text", new TextFieldProcessor("order_text", pipeline));

    MockResponseProcessor responseProcessor = new MockResponseProcessor("Label");

    Map<String, String> inputRow = new HashMap<>();
    inputRow.put("order_text", "Jimmy\n\n\n\nHoffa");
    inputRow.put("Label", "Sheep");

    // --- Case 1: final constructor flag is false, so the "*" marker reaches the features ---
    RowProcessor<MockOutput> rowProcessor = new RowProcessor<>(
            Collections.emptyList(), null, responseProcessor, processors,
            Collections.emptyMap(), Collections.emptySet(), false);
    Example<MockOutput> produced = rowProcessor.generateExample(inputRow, true).get();

    assertEquals(5, produced.size());
    assertEquals("Sheep", produced.getOutput().label);

    Iterator<Feature> features = produced.iterator();
    Feature first = features.next();
    assertEquals("order_text@1-N=*", first.getName());
    assertEquals(1.0, first.getValue());
    assertEquals("order_text@1-N=Hoffa", features.next().getName());
    assertEquals("order_text@1-N=Jimmy", features.next().getName());
    assertEquals("order_text@2-N=*/Hoffa", features.next().getName());
    assertEquals("order_text@2-N=Jimmy/*", features.next().getName());
    assertFalse(features.hasNext());

    // --- Case 2: same row with the flag set to true; the marker no longer appears ---
    // NOTE(review): the original comment describes true as the default for this flag - confirm.
    rowProcessor = new RowProcessor<>(
            Collections.emptyList(), null, responseProcessor, processors,
            Collections.emptyMap(), Collections.emptySet(), true);
    produced = rowProcessor.generateExample(inputRow, true).get();

    assertEquals(3, produced.size());
    assertEquals("Sheep", produced.getOutput().label);

    features = produced.iterator();
    first = features.next();
    assertEquals("order_text@1-N=Hoffa", first.getName());
    assertEquals(1.0, first.getValue());
    assertEquals("order_text@1-N=Jimmy", features.next().getName());
    assertEquals("order_text@2-N=Jimmy/Hoffa", features.next().getName());
    assertFalse(features.hasNext());
}
Aggregations