Search in sources:

Example 1 with CrfSuiteAdapter

use of org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter in project dkpro-tc by dkpro.

the class CRFSuiteBrownPosDemoTest method runTrainTestNoFilter.

/**
 * Runs a CRFsuite train/test experiment (L-BFGS, extra arguments "-p max_iterations=5")
 * with no feature filter (the filter dimension is passed as {@code null}) and inspects
 * the id2outcome file recorded by {@code ContextMemoryReport}.
 */
@Test
public void runTrainTestNoFilter() throws Exception {
    // Learner configuration bundle: classifier arguments, data writer and sparse-feature flag.
    Map<String, Object> crfConfig = new HashMap<>();
    crfConfig.put(DIM_CLASSIFICATION_ARGS,
            new Object[] { new CrfSuiteAdapter(), CrfSuiteAdapter.ALGORITHM_LBFGS, "-p", "max_iterations=5" });
    crfConfig.put(DIM_DATA_WRITER, new CrfSuiteAdapter().getDataWriterClass().getName());
    crfConfig.put(DIM_FEATURE_USE_SPARSE, new CrfSuiteAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> learnerDim = Dimension.createBundle("config", crfConfig);

    ParameterSpace space = CRFSuiteBrownPosDemoSimpleDkproReader.getParameterSpace(
            Constants.FM_SEQUENCE, Constants.LM_SINGLE_LABEL, learnerDim, null);
    javaExperiment.runTrainTest(space);

    // Exactly one id2outcome file is expected to have been recorded.
    assertEquals(1, ContextMemoryReport.id2outcomeFiles.size());
    List<String> outcomeLines = FileUtils.readLines(ContextMemoryReport.id2outcomeFiles.get(0), "utf-8");
    assertEquals(34, outcomeLines.size());

    // Header: column description followed by the label-id mapping.
    assertEquals("#ID=PREDICTION;GOLDSTANDARD;THRESHOLD", outcomeLines.get(0));
    assertEquals("#labels 0=NN 1=JJ 2=NP 3=DTS 4=BEDZ 5=HV 6=PPO 7=DT 8=NNS 9=PPS 10=JJT 11=ABX 12=MD 13=DOD 14=VBD 15=VBG 16=QL 32=%28null%29 17=pct 18=CC 19=VBN 20=NPg 21=IN 22=WDT 23=BEN 24=VB 25=BER 26=AP 27=RB 28=CS 29=AT 30=HVD 31=TO", outcomeLines.get(1));

    // The entry at index 2 is a time stamp and is therefore not compared.
    // CRFsuite results depend to some extent on the platform, so for the first five
    // entries only the gold label and threshold are pinned; the prediction field
    // merely has to contain some number.
    String[] expectedEntries = {
        "0000_0000_0000_The=[0-9]+;29;-1",
        "0000_0000_0001_bill=[0-9]+;0;-1",
        "0000_0000_0002_,=[0-9]+;17;-1",
        "0000_0000_0003_which=[0-9]+;22;-1",
        "0000_0000_0004_Daniel=[0-9]+;2;-1"
    };
    final int firstEntryIndex = 3;
    for (int i = 0; i < expectedEntries.length; i++) {
        assertTrue(outcomeLines.get(firstEntryIndex + i).matches(expectedEntries[i]));
    }
}
Also used : HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) HashMap(java.util.HashMap) Map(java.util.Map) CrfSuiteAdapter(org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter) Test(org.junit.Test)

Example 2 with CrfSuiteAdapter

use of org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter in project dkpro-tc by dkpro.

the class CRFSuiteSaveAndLoadModelTest method loadModelArowParameters.

/**
 * Trains a CRFsuite model (adaptive regularization of weight vector, extra arguments
 * "-p max_iterations=2"), stores it in a fresh folder, then loads it through
 * {@code TcAnnotator} and checks that every token of a small two-sentence text receives
 * an outcome contained in {@code postags}.
 */
@Test
public void loadModelArowParameters() throws Exception {
    CrfSuiteAdapter adapter = new CrfSuiteAdapter();
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { adapter, CrfSuiteAdapter.ALGORITHM_ADAPTIVE_REGULARIZATION_OF_WEIGHT_VECTOR, "-p", "max_iterations=2" });
    config.put(DIM_DATA_WRITER, adapter.getDataWriterClass().getName());
    // Take the sparse-feature setting from the learner that is actually configured.
    // This used to query WekaAdapter, which is unrelated to this CRFsuite experiment.
    config.put(DIM_FEATURE_USE_SPARSE, adapter.useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);

    // create a model
    File modelFolder = folder.newFolder();
    ParameterSpace pSpace = getParameterSpace(mlas);
    executeSaveModelIntoTemporyFolder(pSpace, modelFolder);

    // prepare a small document to be annotated with the stored model
    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentText("This is an example text. It has 2 sentences.");
    jcas.setDocumentLanguage("en");

    AnalysisEngine tokenizer = AnalysisEngineFactory.createEngine(BreakIteratorSegmenter.class);
    AnalysisEngine tcAnno = AnalysisEngineFactory.createEngine(TcAnnotator.class,
            TcAnnotator.PARAM_TC_MODEL_LOCATION, modelFolder.getAbsolutePath(),
            TcAnnotator.PARAM_NAME_SEQUENCE_ANNOTATION, Sentence.class.getName(),
            TcAnnotator.PARAM_NAME_UNIT_ANNOTATION, Token.class.getName());
    tokenizer.process(jcas);
    tcAnno.process(jcas);

    List<TextClassificationOutcome> outcomes = new ArrayList<>(JCasUtil.select(jcas, TextClassificationOutcome.class));
    // 9 tokens + 2 punctuation marks
    assertEquals(11, outcomes.size());
    for (TextClassificationOutcome o : outcomes) {
        assertTrue("Unexpected outcome: " + o.getOutcome(), postags.contains(o.getOutcome()));
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) CrfSuiteAdapter(org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) HashMap(java.util.HashMap) Map(java.util.Map) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine) Test(org.junit.Test)

Example 3 with CrfSuiteAdapter

use of org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter in project dkpro-tc by dkpro.

the class CRFSuiteNERSequenceDemo method getParameterSpace.

/**
 * Assembles the parameter space for the CRFsuite NER sequence demo: a train and a test
 * reader over "*.txt" files, the CRFsuite learner configuration (L-BFGS, extra arguments
 * "-p max_iterations=5"), single-label learning mode, sequence feature mode and a feature
 * set made of {@code TokenRatioPerDocument} and {@code InitialCharacterUpperCase}.
 *
 * @return the assembled parameter space
 * @throws ResourceInitializationException
 *             declared for the reader-description and feature factory calls
 */
public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // Both readers share every setting except the source location.
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            SequenceOutcomeReader.class,
            SequenceOutcomeReader.PARAM_LANGUAGE, "de",
            SequenceOutcomeReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain,
            SequenceOutcomeReader.PARAM_TOKEN_INDEX, 1,
            SequenceOutcomeReader.PARAM_OUTCOME_INDEX, 2,
            SequenceOutcomeReader.PARAM_SKIP_LINES_START_WITH_STRING, "#",
            SequenceOutcomeReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt");
    CollectionReaderDescription testReader = CollectionReaderFactory.createReaderDescription(
            SequenceOutcomeReader.class,
            SequenceOutcomeReader.PARAM_LANGUAGE, "de",
            SequenceOutcomeReader.PARAM_SOURCE_LOCATION, corpusFilePathTest,
            SequenceOutcomeReader.PARAM_TOKEN_INDEX, 1,
            SequenceOutcomeReader.PARAM_OUTCOME_INDEX, 2,
            SequenceOutcomeReader.PARAM_SKIP_LINES_START_WITH_STRING, "#",
            SequenceOutcomeReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt");

    Map<String, Object> readers = new HashMap<>();
    readers.put(DIM_READER_TRAIN, trainReader);
    readers.put(DIM_READER_TEST, testReader);

    // Learner configuration bundle: classifier arguments, data writer and sparse-feature flag.
    Map<String, Object> learnerConfig = new HashMap<>();
    learnerConfig.put(DIM_CLASSIFICATION_ARGS,
            new Object[] { new CrfSuiteAdapter(), CrfSuiteAdapter.ALGORITHM_LBFGS, "-p", "max_iterations=5" });
    learnerConfig.put(DIM_DATA_WRITER, new CrfSuiteAdapter().getDataWriterClass().getName());
    learnerConfig.put(DIM_FEATURE_USE_SPARSE, new CrfSuiteAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> learnerDim = Dimension.createBundle("config", learnerConfig);

    Dimension<TcFeatureSet> featureSetDim = Dimension.create(DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(TokenRatioPerDocument.class),
                    TcFeatureFactory.create(InitialCharacterUpperCase.class)));

    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_LEARNING_MODE, Constants.LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, Constants.FM_SEQUENCE),
            featureSetDim,
            learnerDim);
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) HashMap(java.util.HashMap) Map(java.util.Map) CrfSuiteAdapter(org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter)

Example 4 with CrfSuiteAdapter

use of org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter in project dkpro-tc by dkpro.

the class CRFSuiteBrownPosDemoTest method runTrainTestFilter.

/**
 * Runs a CRFsuite train/test experiment (adaptive regularization of weight vector, no
 * extra arguments) with {@code FilterLuceneCharacterNgramStartingWithLetter} registered
 * as feature filter. The test contains no assertions; it passes if the experiment
 * finishes without throwing.
 */
@SuppressWarnings("unchecked")
@Test
public void runTrainTestFilter() throws Exception {
    // Learner configuration bundle: classifier arguments, data writer and sparse-feature flag.
    Map<String, Object> crfConfig = new HashMap<>();
    crfConfig.put(DIM_CLASSIFICATION_ARGS,
            new Object[] { new CrfSuiteAdapter(), CrfSuiteAdapter.ALGORITHM_ADAPTIVE_REGULARIZATION_OF_WEIGHT_VECTOR });
    crfConfig.put(DIM_DATA_WRITER, new CrfSuiteAdapter().getDataWriterClass().getName());
    crfConfig.put(DIM_FEATURE_USE_SPARSE, new CrfSuiteAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> learnerDim = Dimension.createBundle("config", crfConfig);

    // Single feature filter, referenced by its class name.
    Dimension<List<String>> filterDim = Dimension.create(
            Constants.DIM_FEATURE_FILTERS,
            asList(FilterLuceneCharacterNgramStartingWithLetter.class.getName()));

    javaExperiment.runTrainTest(CRFSuiteBrownPosDemoSimpleDkproReader.getParameterSpace(
            Constants.FM_SEQUENCE, Constants.LM_SINGLE_LABEL, learnerDim, filterDim));
}
Also used : HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) List(java.util.List) Arrays.asList(java.util.Arrays.asList) HashMap(java.util.HashMap) Map(java.util.Map) CrfSuiteAdapter(org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter) Test(org.junit.Test)

Example 5 with CrfSuiteAdapter

use of org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter in project dkpro-tc by dkpro.

the class CRFSuiteSaveAndLoadModelTest method saveModel.

/**
 * Trains a CRFsuite model (adaptive regularization of weight vector, extra arguments
 * "-p max_iterations=2"), stores it in a fresh folder and verifies that every expected
 * model file has been written there.
 */
@Test
public void saveModel() throws Exception {
    CrfSuiteAdapter adapter = new CrfSuiteAdapter();
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { adapter, CrfSuiteAdapter.ALGORITHM_ADAPTIVE_REGULARIZATION_OF_WEIGHT_VECTOR, "-p", "max_iterations=2" });
    config.put(DIM_DATA_WRITER, adapter.getDataWriterClass().getName());
    // Take the sparse-feature setting from the learner that is actually configured.
    // This used to query WekaAdapter, which is unrelated to this CRFsuite experiment.
    config.put(DIM_FEATURE_USE_SPARSE, adapter.useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);

    File modelFolder = folder.newFolder();
    ParameterSpace pSpace = getParameterSpace(mlas);
    executeSaveModelIntoTemporyFolder(pSpace, modelFolder);

    // Every one of these files must exist in the model folder after saving.
    String[] expectedFiles = {
        MODEL_CLASSIFIER,
        MODEL_FEATURE_EXTRACTOR_CONFIGURATION,
        META_COLLECTOR_OVERRIDE,
        META_EXTRACTOR_OVERRIDE,
        MODEL_META,
        MODEL_TC_VERSION,
        MODEL_FEATURE_MODE,
        MODEL_LEARNING_MODE
    };
    for (String fileName : expectedFiles) {
        // File(parent, child) resolves the name without hand-written separator handling.
        File expected = new File(modelFolder, fileName);
        assertTrue("Missing model file: " + expected.getAbsolutePath(), expected.exists());
    }

    // NOTE(review): File.deleteOnExit() only requests deletion of this one path; the model
    // files inside the folder are not registered. 'folder' looks like a JUnit
    // TemporaryFolder rule, which would clean up by itself - confirm and consider
    // dropping this call.
    modelFolder.deleteOnExit();
}
Also used : HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) HashMap(java.util.HashMap) Map(java.util.Map) File(java.io.File) CrfSuiteAdapter(org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter) Test(org.junit.Test)

Aggregations

HashMap (java.util.HashMap)7 Map (java.util.Map)7 ParameterSpace (org.dkpro.lab.task.ParameterSpace)7 CrfSuiteAdapter (org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter)7 Test (org.junit.Test)5 File (java.io.File)3 WekaAdapter (org.dkpro.tc.ml.weka.WekaAdapter)3 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)2 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)2 ArrayList (java.util.ArrayList)2 AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine)2 JCas (org.apache.uima.jcas.JCas)2 TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome)2 Arrays.asList (java.util.Arrays.asList)1 List (java.util.List)1 CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)1 TcFeatureSet (org.dkpro.tc.api.features.TcFeatureSet)1