Search in sources :

Example 1 with InitTaskDeep

use of org.dkpro.tc.core.task.deep.InitTaskDeep in project dkpro-tc by dkpro.

the class DeepLearningExperimentCrossValidation method init.

/**
 * Initializes the experiment. This is called automatically before execution. It's not done
 * directly in the constructor, because we want to be able to use setters instead of the
 * three-argument constructor.
 * <p>
 * Wires up an init task followed by an inner batch task (the cross-validation task) that
 * contains the preparation, embedding, train/test vectorization and learning tasks.
 *
 * @throws IllegalStateException
 *             if no experiment name is set or if the number of folds is smaller than 2
 */
protected void init() throws IllegalStateException {
    if (experimentName == null) {
        throw new IllegalStateException("You must set an experiment name");
    }
    // NOTE(review): if LEAVE_ONE_OUT is a value below 2, this check rejects it before the
    // leave-one-out branch in initialize() below can ever run - confirm against the constant.
    if (numFolds < 2) {
        throw new IllegalStateException("Number of folds is not configured correctly. Number of folds needs to be at " + "least 2 (but was " + numFolds + ")");
    }
    // initialize the setup
    initTask = new InitTaskDeep();
    initTask.setPreprocessing(getPreprocessing());
    initTask.setOperativeViews(operativeViews);
    initTask.setType(initTask.getType() + "-" + experimentName);
    initTask.setAttribute(TC_TASK_TYPE, TcTaskType.INIT_TRAIN.toString());
    // inner batch task (carried out numFolds times)
    DefaultBatchTask crossValidationTask = new DefaultBatchTask() {

        @Discriminator(name = DIM_FEATURE_MODE)
        private String featureMode;

        @Discriminator(name = DIM_CROSS_VALIDATION_MANUAL_FOLDS)
        private boolean useCrossValidationManualFolds;

        /**
         * Builds the parameter space of the inner batch task from the "bin" files found below
         * the training output folder of the init task.
         *
         * @param aContext
         *            the task context used to resolve the init task's output folder
         */
        @Override
        public void initialize(TaskContext aContext) {
            super.initialize(aContext);
            File xmiPathRoot = aContext.getFolder(InitTask.OUTPUT_KEY_TRAIN, AccessMode.READONLY);
            String[] fileNames = listBinFilePaths(xmiPathRoot);
            Arrays.sort(fileNames);
            // leave-one-out: use as many folds as there are files
            if (numFolds == LEAVE_ONE_OUT) {
                numFolds = fileNames.length;
            }
            // manual mode is turned off and there are fewer files than requested folds:
            // create a new split and continue with the files of the new folder
            if (!useCrossValidationManualFolds && fileNames.length < numFolds) {
                xmiPathRoot = createRequestedNumberOfCas(xmiPathRoot, fileNames.length, featureMode);
                // NOTE(review): unlike the first listing, these paths are not sorted - confirm
                // whether the resulting fold order is intended to depend on the listing order.
                fileNames = listBinFilePaths(xmiPathRoot);
            }
            // don't change any names!!
            FoldDimensionBundle<String> foldDim = getFoldDim(fileNames);
            Dimension<File> filesRootDim = Dimension.create(DIM_FILES_ROOT, xmiPathRoot);
            ParameterSpace pSpace = new ParameterSpace(foldDim, filesRootDim);
            setParameterSpace(pSpace);
        }

        /**
         * Recursively collects the absolute paths of all files with the extension "bin" below
         * the given folder.
         *
         * @param root
         *            folder to search
         * @return absolute file paths (not names), in the order returned by the listing
         */
        private String[] listBinFilePaths(File root) {
            Collection<File> files = FileUtils.listFiles(root, new String[] { "bin" }, true);
            String[] paths = new String[files.size()];
            int i = 0;
            for (File f : files) {
                // adding file paths, not names
                paths[i] = f.getAbsolutePath();
                i++;
            }
            return paths;
        }

        /**
         * creates required number of CAS
         *
         * @param xmiPathRoot
         *            input path
         * @param numAvailableJCas
         *            number of CAS files currently available in the input path
         * @param featureMode
         *            the feature mode
         * @return the folder containing the newly created split
         * @throws IllegalStateException
         *             wrapping any exception raised while creating or verifying the split
         */
        private File createRequestedNumberOfCas(File xmiPathRoot, int numAvailableJCas, String featureMode) {
            try {
                File outputFolder = FoldUtil.createMinimalSplit(xmiPathRoot.getAbsolutePath(), numFolds, numAvailableJCas, FM_SEQUENCE.equals(featureMode));
                if (outputFolder == null) {
                    throw new NullPointerException("Output folder is null");
                }
                verifyThatNeededNumberOfCasWasCreated(outputFolder);
                return outputFolder;
            } catch (Exception e) {
                throw new IllegalStateException(e);
            }
        }

        /**
         * Checks that the given folder directly contains at least numFolds files whose name
         * contains ".bin".
         *
         * @param outputFolder
         *            folder to inspect (not searched recursively)
         * @throws IllegalStateException
         *             if fewer than numFolds matching files are found
         */
        private void verifyThatNeededNumberOfCasWasCreated(File outputFolder) {
            int numCas = 0;
            File[] listFiles = outputFolder.listFiles();
            if (listFiles == null) {
                throw new NullPointerException("Retrieving files in folder led to a NullPointer");
            }
            for (File f : listFiles) {
                if (f.getName().contains(".bin")) {
                    numCas++;
                }
            }
            if (numCas < numFolds) {
                throw new IllegalStateException("Not enough TextClassificationUnits found to create at least [" + numFolds + "] folds");
            }
        }
    };
    // ================== SUBTASKS OF THE INNER BATCH TASK
    // =======================
    // collecting meta features only on the training data (numFolds times)
    // get some meta data depending on the whole document collection
    preparationTask = new PreparationTask();
    preparationTask.setType(preparationTask.getType() + "-" + experimentName);
    preparationTask.setMachineLearningAdapter(mlAdapter);
    preparationTask.addImport(initTask, InitTask.OUTPUT_KEY_TRAIN, PreparationTask.INPUT_KEY_TRAIN);
    preparationTask.setAttribute(TC_TASK_TYPE, TcTaskType.PREPARATION.toString());
    embeddingTask = new EmbeddingTask();
    embeddingTask.setType(embeddingTask.getType() + "-" + experimentName);
    embeddingTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, EmbeddingTask.INPUT_MAPPING);
    embeddingTask.setAttribute(TC_TASK_TYPE, TcTaskType.EMBEDDING.toString());
    // feature extraction on training data
    vectorizationTrainTask = new VectorizationTask();
    vectorizationTrainTask.setType(vectorizationTrainTask.getType() + "-Train-" + experimentName);
    vectorizationTrainTask.setTesting(false);
    vectorizationTrainTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, VectorizationTask.MAPPING_INPUT_KEY);
    vectorizationTrainTask.setAttribute(TC_TASK_TYPE, TcTaskType.VECTORIZATION_TRAIN.toString());
    // feature extraction on test data
    vectorizationTestTask = new VectorizationTask();
    vectorizationTestTask.setType(vectorizationTestTask.getType() + "-Test-" + experimentName);
    vectorizationTestTask.setTesting(true);
    vectorizationTestTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, VectorizationTask.MAPPING_INPUT_KEY);
    // the task type must be set on the *test* task; setting it on the train task would
    // overwrite VECTORIZATION_TRAIN and leave the test task without a task type
    vectorizationTestTask.setAttribute(TC_TASK_TYPE, TcTaskType.VECTORIZATION_TEST.toString());
    // test task operating on the models of the feature extraction train and
    // test tasks
    learningTask = mlAdapter.getTestTask();
    learningTask.setType(learningTask.getType() + "-" + experimentName);
    learningTask.setAttribute(TC_TASK_TYPE, TcTaskType.MACHINE_LEARNING_ADAPTER.toString());
    if (innerReports != null) {
        for (Class<? extends Report> report : innerReports) {
            learningTask.addReport(report);
        }
    }
    // always add the adapter's id/baseline/meta reports and the basic result report
    learningTask.addReport(mlAdapter.getOutcomeIdReportClass());
    learningTask.addReport(mlAdapter.getMajorityBaselineIdReportClass());
    learningTask.addReport(mlAdapter.getRandomBaselineIdReportClass());
    learningTask.addReport(mlAdapter.getMetaCollectionReport());
    learningTask.addReport(BasicResultReport.class);
    learningTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, TcDeepLearningAdapter.PREPARATION_FOLDER);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, Constants.TEST_TASK_INPUT_KEY_TRAINING_DATA);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, Constants.TEST_TASK_INPUT_KEY_TEST_DATA);
    learningTask.addImport(embeddingTask, EmbeddingTask.OUTPUT_KEY, TcDeepLearningAdapter.EMBEDDING_FOLDER);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.VECTORIZIATION_TRAIN_OUTPUT);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.TARGET_ID_MAPPING_TRAIN);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.VECTORIZIATION_TEST_OUTPUT);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.TARGET_ID_MAPPING_TEST);
    // ================== CONFIG OF THE INNER BATCH TASK
    // =======================
    crossValidationTask.addImport(initTask, InitTask.OUTPUT_KEY_TRAIN);
    crossValidationTask.setType(crossValidationTask.getType() + "-" + experimentName);
    crossValidationTask.addTask(preparationTask);
    crossValidationTask.addTask(embeddingTask);
    crossValidationTask.addTask(vectorizationTrainTask);
    crossValidationTask.addTask(vectorizationTestTask);
    crossValidationTask.addTask(learningTask);
    crossValidationTask.setExecutionPolicy(ExecutionPolicy.USE_EXISTING);
    // report of the inner batch task (sums up results for the folds)
    // we want to re-use the old CV report, we need to collect the
    // evaluation.bin files from
    // the test task here (with another report)
    crossValidationTask.addReport(DeepLearningInnerBatchReport.class);
    crossValidationTask.setAttribute(TC_TASK_TYPE, TcTaskType.CROSS_VALIDATION.toString());
    // DKPro Lab issue 38: must be added as *first* task
    addTask(initTask);
    addTask(crossValidationTask);
}
Also used : TaskContext(org.dkpro.lab.engine.TaskContext) PreparationTask(org.dkpro.tc.core.task.deep.PreparationTask) InitTaskDeep(org.dkpro.tc.core.task.deep.InitTaskDeep) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) ParameterSpace(org.dkpro.lab.task.ParameterSpace) VectorizationTask(org.dkpro.tc.core.task.deep.VectorizationTask) File(java.io.File) EmbeddingTask(org.dkpro.tc.core.task.deep.EmbeddingTask) DefaultBatchTask(org.dkpro.lab.task.impl.DefaultBatchTask)

Example 2 with InitTaskDeep

use of org.dkpro.tc.core.task.deep.InitTaskDeep in project dkpro-tc by dkpro.

the class DeepLearningExperimentTrainTest method init.

/**
 * Initializes the experiment. This is called automatically before execution. It's not done
 * directly in the constructor, because we want to be able to use setters instead of the
 * arguments in the constructor.
 * <p>
 * Wires up separate init tasks for the train and the test data, followed by the preparation,
 * embedding, train/test vectorization and learning tasks.
 *
 * @throws IllegalStateException
 *             if no experiment name is set
 */
@Override
protected void init() {
    if (experimentName == null) {
        throw new IllegalStateException("You must set an experiment name");
    }
    // init the train part of the experiment
    initTaskTrain = new InitTaskDeep();
    initTaskTrain.setPreprocessing(getPreprocessing());
    initTaskTrain.setOperativeViews(operativeViews);
    initTaskTrain.setTesting(false);
    initTaskTrain.setType(initTaskTrain.getType() + "-Train-" + experimentName);
    initTaskTrain.setAttribute(TC_TASK_TYPE, TcTaskType.INIT_TRAIN.toString());
    // init the test part of the experiment
    initTaskTest = new InitTaskDeep();
    initTaskTest.setTesting(true);
    initTaskTest.setPreprocessing(getPreprocessing());
    initTaskTest.setOperativeViews(operativeViews);
    initTaskTest.setType(initTaskTest.getType() + "-Test-" + experimentName);
    initTaskTest.setAttribute(TC_TASK_TYPE, TcTaskType.INIT_TEST.toString());
    // get some meta data depending on the whole document collection
    preparationTask = new PreparationTask();
    preparationTask.setType(preparationTask.getType() + "-" + experimentName);
    preparationTask.setMachineLearningAdapter(mlAdapter);
    preparationTask.addImport(initTaskTrain, InitTask.OUTPUT_KEY_TRAIN, PreparationTask.INPUT_KEY_TRAIN);
    preparationTask.addImport(initTaskTest, InitTask.OUTPUT_KEY_TEST, PreparationTask.INPUT_KEY_TEST);
    preparationTask.setAttribute(TC_TASK_TYPE, TcTaskType.PREPARATION.toString());
    embeddingTask = new EmbeddingTask();
    embeddingTask.setType(embeddingTask.getType() + "-" + experimentName);
    embeddingTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, EmbeddingTask.INPUT_MAPPING);
    embeddingTask.setAttribute(TC_TASK_TYPE, TcTaskType.EMBEDDING.toString());
    // feature extraction on training data
    vectorizationTrainTask = new VectorizationTask();
    vectorizationTrainTask.setType(vectorizationTrainTask.getType() + "-Train-" + experimentName);
    vectorizationTrainTask.setTesting(false);
    vectorizationTrainTask.addImport(initTaskTrain, InitTaskDeep.OUTPUT_KEY_TRAIN, VectorizationTask.DATA_INPUT_KEY);
    vectorizationTrainTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, VectorizationTask.MAPPING_INPUT_KEY);
    vectorizationTrainTask.setAttribute(TC_TASK_TYPE, TcTaskType.VECTORIZATION_TRAIN.toString());
    // feature extraction on test data
    vectorizationTestTask = new VectorizationTask();
    vectorizationTestTask.setType(vectorizationTestTask.getType() + "-Test-" + experimentName);
    vectorizationTestTask.setTesting(true);
    vectorizationTestTask.addImport(initTaskTest, InitTaskDeep.OUTPUT_KEY_TEST, VectorizationTask.DATA_INPUT_KEY);
    vectorizationTestTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, VectorizationTask.MAPPING_INPUT_KEY);
    // the task type must be set on the *test* task; setting it on the train task would
    // overwrite VECTORIZATION_TRAIN and leave the test task without a task type
    vectorizationTestTask.setAttribute(TC_TASK_TYPE, TcTaskType.VECTORIZATION_TEST.toString());
    // learning task operating on the outputs of the preparation, embedding and
    // train/test vectorization tasks
    learningTask = mlAdapter.getTestTask();
    learningTask.setType(learningTask.getType() + "-" + experimentName);
    learningTask.setAttribute(TC_TASK_TYPE, TcTaskType.MACHINE_LEARNING_ADAPTER.toString());
    if (innerReports != null) {
        for (Class<? extends Report> report : innerReports) {
            learningTask.addReport(report);
        }
    }
    // always add the adapter's id/baseline/meta reports and the basic result report
    learningTask.addReport(mlAdapter.getOutcomeIdReportClass());
    learningTask.addReport(mlAdapter.getMajorityBaselineIdReportClass());
    learningTask.addReport(mlAdapter.getRandomBaselineIdReportClass());
    learningTask.addReport(mlAdapter.getMetaCollectionReport());
    learningTask.addReport(BasicResultReport.class);
    learningTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, TcDeepLearningAdapter.PREPARATION_FOLDER);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, Constants.TEST_TASK_INPUT_KEY_TRAINING_DATA);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, Constants.TEST_TASK_INPUT_KEY_TEST_DATA);
    learningTask.addImport(embeddingTask, EmbeddingTask.OUTPUT_KEY, TcDeepLearningAdapter.EMBEDDING_FOLDER);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.VECTORIZIATION_TRAIN_OUTPUT);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.TARGET_ID_MAPPING_TRAIN);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.VECTORIZIATION_TEST_OUTPUT);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.TARGET_ID_MAPPING_TEST);
    // DKPro Lab issue 38: the init tasks must be added as the *first* tasks
    addTask(initTaskTrain);
    addTask(initTaskTest);
    addTask(preparationTask);
    addTask(embeddingTask);
    addTask(vectorizationTrainTask);
    addTask(vectorizationTestTask);
    addTask(learningTask);
}
Also used : PreparationTask(org.dkpro.tc.core.task.deep.PreparationTask) VectorizationTask(org.dkpro.tc.core.task.deep.VectorizationTask) EmbeddingTask(org.dkpro.tc.core.task.deep.EmbeddingTask) InitTaskDeep(org.dkpro.tc.core.task.deep.InitTaskDeep)

Aggregations

EmbeddingTask (org.dkpro.tc.core.task.deep.EmbeddingTask)2 InitTaskDeep (org.dkpro.tc.core.task.deep.InitTaskDeep)2 PreparationTask (org.dkpro.tc.core.task.deep.PreparationTask)2 VectorizationTask (org.dkpro.tc.core.task.deep.VectorizationTask)2 File (java.io.File)1 TaskContext (org.dkpro.lab.engine.TaskContext)1 ParameterSpace (org.dkpro.lab.task.ParameterSpace)1 DefaultBatchTask (org.dkpro.lab.task.impl.DefaultBatchTask)1 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)1