use of org.dkpro.tc.core.task.deep.InitTaskDeep in project dkpro-tc by dkpro.
the class DeepLearningExperimentCrossValidation method init.
/**
 * Initializes the experiment. This is called automatically before execution. It's not done
 * directly in the constructor, because we want to be able to use setters instead of the
 * three-argument constructor.
 * <p>
 * Registers two top-level tasks: the init task and an inner batch task that holds the
 * preparation, embedding, vectorization (train/test) and learning tasks. The inner batch
 * task builds its own parameter space from the ".bin" files written by the init task.
 *
 * @throws IllegalStateException
 *             if no experiment name is set or if fewer than two folds are configured
 */
protected void init() throws IllegalStateException {
    if (experimentName == null) {
        throw new IllegalStateException("You must set an experiment name");
    }
    // NOTE(review): if LEAVE_ONE_OUT is encoded as a value below 2, this guard rejects it
    // before initialize() below gets the chance to expand it - confirm against the constant.
    if (numFolds < 2) {
        throw new IllegalStateException("Number of folds is not configured correctly. Number of folds needs to be at "
                + "least 2 (but was " + numFolds + ")");
    }

    // initialize the setup
    initTask = new InitTaskDeep();
    initTask.setPreprocessing(getPreprocessing());
    initTask.setOperativeViews(operativeViews);
    initTask.setType(initTask.getType() + "-" + experimentName);
    initTask.setAttribute(TC_TASK_TYPE, TcTaskType.INIT_TRAIN.toString());

    // inner batch task (carried out numFolds times)
    DefaultBatchTask crossValidationTask = new DefaultBatchTask() {
        @Discriminator(name = DIM_FEATURE_MODE)
        private String featureMode;

        @Discriminator(name = DIM_CROSS_VALIDATION_MANUAL_FOLDS)
        private boolean useCrossValidationManualFolds;

        /**
         * Builds the parameter space of the inner batch task from the ".bin" files found
         * in the init task's training output folder.
         */
        @Override
        public void initialize(TaskContext aContext) {
            super.initialize(aContext);

            File xmiPathRoot = aContext.getFolder(InitTask.OUTPUT_KEY_TRAIN, AccessMode.READONLY);
            String[] fileNames = collectBinFilePaths(xmiPathRoot);
            Arrays.sort(fileNames);

            // leave-one-out: one fold per available file
            if (numFolds == LEAVE_ONE_OUT) {
                numFolds = fileNames.length;
            }

            // manual mode is turned off and there are fewer files than requested folds:
            // split the data into more files and continue with the newly created folder
            if (!useCrossValidationManualFolds && fileNames.length < numFolds) {
                xmiPathRoot = createRequestedNumberOfCas(xmiPathRoot, fileNames.length, featureMode);
                // as before, the re-listed paths are deliberately left in listing order
                fileNames = collectBinFilePaths(xmiPathRoot);
            }

            // don't change any names!!
            FoldDimensionBundle<String> foldDim = getFoldDim(fileNames);
            Dimension<File> filesRootDim = Dimension.create(DIM_FILES_ROOT, xmiPathRoot);
            ParameterSpace pSpace = new ParameterSpace(foldDim, filesRootDim);
            setParameterSpace(pSpace);
        }

        /**
         * Recursively lists all files with the extension "bin" below the given folder.
         *
         * @param root
         *            folder to search
         * @return absolute paths (not bare names) of the files, in listing order
         */
        private String[] collectBinFilePaths(File root) {
            Collection<File> files = FileUtils.listFiles(root, new String[] { "bin" }, true);
            String[] paths = new String[files.size()];
            int i = 0;
            for (File f : files) {
                // adding file paths, not names
                paths[i] = f.getAbsolutePath();
                i++;
            }
            return paths;
        }

        /**
         * creates required number of CAS
         *
         * @param xmiPathRoot
         *            input path
         * @param numAvailableJCas
         *            number of CAS files currently available
         * @param featureMode
         *            the feature mode
         * @return the folder containing the newly created split
         * @throws IllegalStateException
         *             wrapping any exception raised while creating or verifying the split
         */
        private File createRequestedNumberOfCas(File xmiPathRoot, int numAvailableJCas, String featureMode) {
            try {
                File outputFolder = FoldUtil.createMinimalSplit(xmiPathRoot.getAbsolutePath(), numFolds,
                        numAvailableJCas, FM_SEQUENCE.equals(featureMode));
                if (outputFolder == null) {
                    throw new NullPointerException("Output folder is null");
                }
                verifyThatNeededNumberOfCasWasCreated(outputFolder);
                return outputFolder;
            } catch (Exception e) {
                throw new IllegalStateException(e);
            }
        }

        /**
         * Checks that the folder directly contains at least numFolds files whose name
         * contains ".bin".
         *
         * @param outputFolder
         *            folder produced by the split
         * @throws IllegalStateException
         *             if fewer than numFolds such files are present
         */
        private void verifyThatNeededNumberOfCasWasCreated(File outputFolder) {
            int numCas = 0;
            File[] listFiles = outputFolder.listFiles();
            if (listFiles == null) {
                throw new NullPointerException("Retrieving files in folder led to a NullPointer");
            }
            for (File f : listFiles) {
                if (f.getName().contains(".bin")) {
                    numCas++;
                }
            }
            if (numCas < numFolds) {
                throw new IllegalStateException(
                        "Not enough TextClassificationUnits found to create at least [" + numFolds + "] folds");
            }
        }
    };

    // ================== SUBTASKS OF THE INNER BATCH TASK
    // =======================
    // collecting meta features only on the training data (numFolds times)
    // get some meta data depending on the whole document collection
    preparationTask = new PreparationTask();
    preparationTask.setType(preparationTask.getType() + "-" + experimentName);
    preparationTask.setMachineLearningAdapter(mlAdapter);
    preparationTask.addImport(initTask, InitTask.OUTPUT_KEY_TRAIN, PreparationTask.INPUT_KEY_TRAIN);
    preparationTask.setAttribute(TC_TASK_TYPE, TcTaskType.PREPARATION.toString());

    embeddingTask = new EmbeddingTask();
    embeddingTask.setType(embeddingTask.getType() + "-" + experimentName);
    embeddingTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, EmbeddingTask.INPUT_MAPPING);
    embeddingTask.setAttribute(TC_TASK_TYPE, TcTaskType.EMBEDDING.toString());

    // feature extraction on training data
    vectorizationTrainTask = new VectorizationTask();
    vectorizationTrainTask.setType(vectorizationTrainTask.getType() + "-Train-" + experimentName);
    vectorizationTrainTask.setTesting(false);
    vectorizationTrainTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, VectorizationTask.MAPPING_INPUT_KEY);
    vectorizationTrainTask.setAttribute(TC_TASK_TYPE, TcTaskType.VECTORIZATION_TRAIN.toString());

    // feature extraction on test data
    vectorizationTestTask = new VectorizationTask();
    vectorizationTestTask.setType(vectorizationTestTask.getType() + "-Test-" + experimentName);
    vectorizationTestTask.setTesting(true);
    vectorizationTestTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, VectorizationTask.MAPPING_INPUT_KEY);
    // the test type belongs on the test task; setting it on the train task would overwrite
    // the train task's own type and leave the test task without one
    vectorizationTestTask.setAttribute(TC_TASK_TYPE, TcTaskType.VECTORIZATION_TEST.toString());

    // test task operating on the models of the feature extraction train and
    // test tasks
    learningTask = mlAdapter.getTestTask();
    learningTask.setType(learningTask.getType() + "-" + experimentName);
    learningTask.setAttribute(TC_TASK_TYPE, TcTaskType.MACHINE_LEARNING_ADAPTER.toString());
    if (innerReports != null) {
        for (Class<? extends Report> report : innerReports) {
            learningTask.addReport(report);
        }
    }

    // reports that are always added, independent of the user-supplied inner reports
    learningTask.addReport(mlAdapter.getOutcomeIdReportClass());
    learningTask.addReport(mlAdapter.getMajorityBaselineIdReportClass());
    learningTask.addReport(mlAdapter.getRandomBaselineIdReportClass());
    learningTask.addReport(mlAdapter.getMetaCollectionReport());
    learningTask.addReport(BasicResultReport.class);

    learningTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, TcDeepLearningAdapter.PREPARATION_FOLDER);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, Constants.TEST_TASK_INPUT_KEY_TRAINING_DATA);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, Constants.TEST_TASK_INPUT_KEY_TEST_DATA);
    learningTask.addImport(embeddingTask, EmbeddingTask.OUTPUT_KEY, TcDeepLearningAdapter.EMBEDDING_FOLDER);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.VECTORIZIATION_TRAIN_OUTPUT);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.TARGET_ID_MAPPING_TRAIN);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.VECTORIZIATION_TEST_OUTPUT);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.TARGET_ID_MAPPING_TEST);

    // ================== CONFIG OF THE INNER BATCH TASK
    // =======================
    crossValidationTask.addImport(initTask, InitTask.OUTPUT_KEY_TRAIN);
    crossValidationTask.setType(crossValidationTask.getType() + "-" + experimentName);
    crossValidationTask.addTask(preparationTask);
    crossValidationTask.addTask(embeddingTask);
    crossValidationTask.addTask(vectorizationTrainTask);
    crossValidationTask.addTask(vectorizationTestTask);
    crossValidationTask.addTask(learningTask);
    crossValidationTask.setExecutionPolicy(ExecutionPolicy.USE_EXISTING);

    // report of the inner batch task (sums up results for the folds)
    // we want to re-use the old CV report, we need to collect the
    // evaluation.bin files from
    // the test task here (with another report)
    crossValidationTask.addReport(DeepLearningInnerBatchReport.class);
    crossValidationTask.setAttribute(TC_TASK_TYPE, TcTaskType.CROSS_VALIDATION.toString());

    // DKPro Lab issue 38: must be added as *first* task
    addTask(initTask);
    addTask(crossValidationTask);
}
use of org.dkpro.tc.core.task.deep.InitTaskDeep in project dkpro-tc by dkpro.
the class DeepLearningExperimentTrainTest method init.
/**
 * Initializes the experiment. This is called automatically before execution. It's not done
 * directly in the constructor, because we want to be able to use setters instead of the
 * arguments in the constructor.
 * <p>
 * Creates and registers, in this order: the train and test init tasks, the preparation
 * task, the embedding task, the train and test vectorization tasks and the learning task.
 *
 * @throws IllegalStateException
 *             if no experiment name is set
 */
@Override
protected void init() {
    if (experimentName == null) {
        throw new IllegalStateException("You must set an experiment name");
    }

    // init the train part of the experiment
    initTaskTrain = new InitTaskDeep();
    initTaskTrain.setPreprocessing(getPreprocessing());
    initTaskTrain.setOperativeViews(operativeViews);
    initTaskTrain.setTesting(false);
    initTaskTrain.setType(initTaskTrain.getType() + "-Train-" + experimentName);
    initTaskTrain.setAttribute(TC_TASK_TYPE, TcTaskType.INIT_TRAIN.toString());

    // init the test part of the experiment
    initTaskTest = new InitTaskDeep();
    initTaskTest.setTesting(true);
    initTaskTest.setPreprocessing(getPreprocessing());
    initTaskTest.setOperativeViews(operativeViews);
    initTaskTest.setType(initTaskTest.getType() + "-Test-" + experimentName);
    initTaskTest.setAttribute(TC_TASK_TYPE, TcTaskType.INIT_TEST.toString());

    // get some meta data depending on the whole document collection
    preparationTask = new PreparationTask();
    preparationTask.setType(preparationTask.getType() + "-" + experimentName);
    preparationTask.setMachineLearningAdapter(mlAdapter);
    preparationTask.addImport(initTaskTrain, InitTask.OUTPUT_KEY_TRAIN, PreparationTask.INPUT_KEY_TRAIN);
    preparationTask.addImport(initTaskTest, InitTask.OUTPUT_KEY_TEST, PreparationTask.INPUT_KEY_TEST);
    preparationTask.setAttribute(TC_TASK_TYPE, TcTaskType.PREPARATION.toString());

    embeddingTask = new EmbeddingTask();
    embeddingTask.setType(embeddingTask.getType() + "-" + experimentName);
    embeddingTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, EmbeddingTask.INPUT_MAPPING);
    embeddingTask.setAttribute(TC_TASK_TYPE, TcTaskType.EMBEDDING.toString());

    // feature extraction on training data
    vectorizationTrainTask = new VectorizationTask();
    vectorizationTrainTask.setType(vectorizationTrainTask.getType() + "-Train-" + experimentName);
    vectorizationTrainTask.setTesting(false);
    vectorizationTrainTask.addImport(initTaskTrain, InitTaskDeep.OUTPUT_KEY_TRAIN, VectorizationTask.DATA_INPUT_KEY);
    vectorizationTrainTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, VectorizationTask.MAPPING_INPUT_KEY);
    vectorizationTrainTask.setAttribute(TC_TASK_TYPE, TcTaskType.VECTORIZATION_TRAIN.toString());

    // feature extraction on test data
    vectorizationTestTask = new VectorizationTask();
    vectorizationTestTask.setType(vectorizationTestTask.getType() + "-Test-" + experimentName);
    vectorizationTestTask.setTesting(true);
    vectorizationTestTask.addImport(initTaskTest, InitTaskDeep.OUTPUT_KEY_TEST, VectorizationTask.DATA_INPUT_KEY);
    vectorizationTestTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, VectorizationTask.MAPPING_INPUT_KEY);
    // the test type belongs on the test task; setting it on the train task would overwrite
    // the train task's own type and leave the test task without one
    vectorizationTestTask.setAttribute(TC_TASK_TYPE, TcTaskType.VECTORIZATION_TEST.toString());

    // learning task consuming the outputs of preparation, embedding and both vectorizations
    learningTask = mlAdapter.getTestTask();
    learningTask.setType(learningTask.getType() + "-" + experimentName);
    learningTask.setAttribute(TC_TASK_TYPE, TcTaskType.MACHINE_LEARNING_ADAPTER.toString());
    if (innerReports != null) {
        for (Class<? extends Report> report : innerReports) {
            learningTask.addReport(report);
        }
    }

    // reports that are always added, independent of the user-supplied inner reports
    learningTask.addReport(mlAdapter.getOutcomeIdReportClass());
    learningTask.addReport(mlAdapter.getMajorityBaselineIdReportClass());
    learningTask.addReport(mlAdapter.getRandomBaselineIdReportClass());
    learningTask.addReport(mlAdapter.getMetaCollectionReport());
    learningTask.addReport(BasicResultReport.class);

    learningTask.addImport(preparationTask, PreparationTask.OUTPUT_KEY, TcDeepLearningAdapter.PREPARATION_FOLDER);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, Constants.TEST_TASK_INPUT_KEY_TRAINING_DATA);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, Constants.TEST_TASK_INPUT_KEY_TEST_DATA);
    learningTask.addImport(embeddingTask, EmbeddingTask.OUTPUT_KEY, TcDeepLearningAdapter.EMBEDDING_FOLDER);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.VECTORIZIATION_TRAIN_OUTPUT);
    learningTask.addImport(vectorizationTrainTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.TARGET_ID_MAPPING_TRAIN);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.VECTORIZIATION_TEST_OUTPUT);
    learningTask.addImport(vectorizationTestTask, VectorizationTask.OUTPUT_KEY, TcDeepLearningAdapter.TARGET_ID_MAPPING_TEST);

    // DKPro Lab issue 38: must be added as *first* task
    addTask(initTaskTrain);
    addTask(initTaskTest);
    addTask(preparationTask);
    addTask(embeddingTask);
    addTask(vectorizationTrainTask);
    addTask(vectorizationTestTask);
    addTask(learningTask);
}
Aggregations