Search in sources :

Example 16 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

The method writeClassifierFormat of the class LibsvmDataFormatWriter.

/**
 * Writes every instance of the given collection as one line to the classifier output.
 *
 * <p>Each line consists of the outcome (the raw outcome in regression mode, otherwise the value
 * looked up in {@code outcomeMap}), a tab, whatever {@code injectSequenceId} returns for the
 * instance, and then the tab-separated {@code id:value} pairs of the instance's features in
 * ascending feature-id order. Features whose absolute value is below {@code 1e-8} are left out.
 * After all instances are written, the index-to-instance-id, feature-name and outcome mappings
 * are written to the output directory. The writer is always closed and reset to {@code null}
 * when the method returns.
 *
 * @param in the instances to write
 * @throws AnalysisEngineProcessException wrapping any exception raised while writing
 */
@Override
public void writeClassifierFormat(Collection<Instance> in) throws AnalysisEngineProcessException {
    try {
        if (featureNames2id == null) {
            createFeatureNameMap();
        }
        initClassifierFormat();

        // Work on a snapshot of the incoming collection.
        List<Instance> snapshot = new ArrayList<>(in);
        for (Instance current : snapshot) {
            recordInstanceId(current, maxId++, index2instanceId);

            // Collect the non-zero feature values keyed by their numeric feature id.
            Map<Integer, Double> valuesById = new HashMap<>();
            for (Feature feature : current.getFeatures()) {
                Integer featureId = featureNames2id.get(feature.getName());
                Double featureValue = toValue(feature.getValue());
                boolean isZero = Math.abs(featureValue) < 0.00000001;
                if (!isZero) {
                    valuesById.put(featureId, featureValue);
                }
            }

            List<Integer> sortedIds = new ArrayList<Integer>(valuesById.keySet());
            Collections.sort(sortedIds);

            String outcomeColumn = isRegression()
                    ? current.getOutcome() + "\t"
                    : outcomeMap.get(current.getOutcome()) + "\t";
            bw.append(outcomeColumn);
            bw.append(injectSequenceId(current));

            // A leading separator (empty for the first pair) yields tab-separated pairs
            // without a trailing tab.
            String separator = "";
            for (Integer featureId : sortedIds) {
                bw.append(separator);
                Double featureValue = valuesById.get(featureId);
                bw.append(featureId.toString() + ":" + featureValue.toString());
                separator = "\t";
            }
            bw.append("\n");
        }

        writeMapping(outputDirectory, INDEX2INSTANCEID, index2instanceId);
        writeFeatureName2idMapping(outputDirectory, AdapterFormat.getFeatureNameMappingFilename(), featureNames2id);
        writeOutcomeMapping(outputDirectory, AdapterFormat.getOutcomeMappingFilename(), outcomeMap);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    } finally {
        IOUtils.closeQuietly(bw);
        // important, we reopen the stream only if the pointer is null!
        bw = null;
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Feature(org.dkpro.tc.api.features.Feature) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) IOException(java.io.IOException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException)

Example 17 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

The method transformFromGeneric of the class LibsvmDataFormatWriter.

/**
 * Converts the generic feature file in the output directory into the classifier format.
 *
 * <p>The file is read line by line; each line is parsed as a JSON array of {@link Instance}
 * objects and handed to {@link #writeClassifierFormat(Collection)}. Once every line has been
 * processed successfully the generic feature file is deleted.
 *
 * @throws Exception if the file cannot be read or parsed, or if writing an instance batch fails;
 *         in that case the reader is still closed and the generic feature file is kept
 */
@Override
public void transformFromGeneric() throws Exception {
    File genericFile = new File(outputDirectory, Constants.GENERIC_FEATURE_FILE);
    // try-with-resources guarantees the reader is released even when parsing or writing fails.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(genericFile), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            Instance[] instance = gson.fromJson(line, Instance[].class);
            List<Instance> ins = new ArrayList<>(Arrays.asList(instance));
            writeClassifierFormat(ins);
        }
    }
    // Only reached when the whole file was transformed without error.
    FileUtils.deleteQuietly(genericFile);
}
Also used : InputStreamReader(java.io.InputStreamReader) Instance(org.dkpro.tc.api.features.Instance) BufferedReader(java.io.BufferedReader) ArrayList(java.util.ArrayList) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 18 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

The method writeClassifierFormat of the class CrfSuiteDataWriter.

/**
 * Writes the given instances to the classifier output and closes the writer.
 *
 * <p>The instances are fed to a {@code CrfSuiteFeatureFormatExtractionIterator}; every element it
 * yields is written followed by a newline. The writer field is closed and reset to {@code null}
 * on every exit path, so a failed call does not leave an open writer behind.
 *
 * @param instances the instances to write
 * @throws AnalysisEngineProcessException wrapping any exception raised while initialising,
 *         writing or closing the output
 */
@Override
public void writeClassifierFormat(Collection<Instance> instances) throws AnalysisEngineProcessException {
    try {
        initClassifierFormat();
        Iterator<StringBuilder> sequenceIterator = new CrfSuiteFeatureFormatExtractionIterator(new ArrayList<Instance>(instances));
        while (sequenceIterator.hasNext()) {
            String features = sequenceIterator.next().toString();
            bw.write(features);
            bw.write("\n");
        }
        // Close inside the try on the success path so that flush/close errors are reported.
        bw.close();
        bw = null;
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    } finally {
        // Only non-null here when something above failed: release the writer without masking
        // the exception that is already propagating.
        if (bw != null) {
            try {
                bw.close();
            } catch (Exception ignored) {
                // best-effort cleanup; the primary failure is already being reported
            }
            bw = null;
        }
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) IOException(java.io.IOException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException)

Example 19 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

The method transformFromGeneric of the class CrfSuiteDataWriter.

/**
 * Converts the generic feature file in the output directory into the classifier format file.
 *
 * <p>Each line of the generic file is parsed as a JSON array of {@link Instance} objects and run
 * through a {@code CrfSuiteFeatureFormatExtractionIterator}; every element it yields is written
 * to {@code classifierFormatOutputFile} followed by a newline.
 *
 * @throws Exception if reading, parsing or writing fails; both streams are closed regardless
 */
@Override
public void transformFromGeneric() throws Exception {
    // The reader is opened first so that a missing input file fails before the output file is
    // created. try-with-resources closes both streams even when an exception is thrown mid-way.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(outputDirectory, Constants.GENERIC_FEATURE_FILE)), "utf-8"));
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(classifierFormatOutputFile), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            Instance[] instance = gson.fromJson(line, Instance[].class);
            List<Instance> ins = new ArrayList<>(Arrays.asList(instance));
            Iterator<StringBuilder> sequenceIterator = new CrfSuiteFeatureFormatExtractionIterator(ins);
            while (sequenceIterator.hasNext()) {
                String features = sequenceIterator.next().toString();
                writer.write(features);
                writer.write("\n");
            }
        }
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) FileInputStream(java.io.FileInputStream) BufferedWriter(java.io.BufferedWriter) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File)

Example 20 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

The method extractFeaturesConnectorSingleLabelTest of the class ExtractFeaturesConnectorTest.

/**
 * Runs a reader / segmenter / document-annotator / feature-extractor-connector pipeline that
 * writes through {@code JsonDataWriter}, then parses the produced JSON file line by line and
 * checks that it holds two instances sharing a single unique outcome.
 */
@Test
public void extractFeaturesConnectorSingleLabelTest() throws Exception {
    File targetDir = folder.newFolder();

    // we do not need parameters here, but in case we do :)
    Object[] extractorParams = new Object[] { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    List<ExternalResourceDescription> extractors = new ArrayList<>();
    extractors.add(ExternalResourceFactory.createExternalResourceDescription(NoopFeatureExtractor.class, extractorParams));

    // Assemble the pipeline components.
    CollectionReaderDescription sourceReader = CollectionReaderFactory.createReaderDescription(TestReaderSingleLabel.class, TestReaderSingleLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");
    AnalysisEngineDescription tokenizer = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription documentMode = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription connector = TaskUtils.getFeatureExtractorConnector(targetDir.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_REGRESSION, Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), extractors, new String[] {});
    SimplePipeline.runPipeline(sourceReader, tokenizer, documentMode, connector);

    // Dump the raw output, then parse it one JSON object per line.
    File jsonOutput = new File(targetDir, JsonDataWriter.JSON_FILE_NAME);
    System.out.println(FileUtils.readFileToString(jsonOutput, "utf-8"));
    List<String> jsonLines = FileUtils.readLines(jsonOutput, "utf-8");

    Gson parser = new Gson();
    List<Instance> parsed = new ArrayList<>();
    for (String jsonLine : jsonLines) {
        Instance decoded = parser.fromJson(jsonLine, Instance.class);
        parsed.add(decoded);
    }

    assertEquals(2, parsed.size());
    assertEquals(1, getUniqueOutcomes(parsed));
}
Also used : JsonDataWriter(org.dkpro.tc.core.io.JsonDataWriter) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Gson(com.google.gson.Gson) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) File(java.io.File) ExternalResourceDescription(org.apache.uima.resource.ExternalResourceDescription) Test(org.junit.Test)

Aggregations

Instance (org.dkpro.tc.api.features.Instance)61 ArrayList (java.util.ArrayList)38 Feature (org.dkpro.tc.api.features.Feature)30 Test (org.junit.Test)27 File (java.io.File)17 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)14 IOException (java.io.IOException)12 Gson (com.google.gson.Gson)8 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)8 Attribute (weka.core.Attribute)8 DenseInstance (weka.core.DenseInstance)8 Instances (weka.core.Instances)8 SparseInstance (weka.core.SparseInstance)8 FeatureExtractorResource_ImplBase (org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase)6 BufferedReader (java.io.BufferedReader)5 FileInputStream (java.io.FileInputStream)5 InputStreamReader (java.io.InputStreamReader)5 AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)5 CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)5 ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription)5