Commit a81878c9 by simonabottani

Add abstract class to read tsv files for ML

parent b3616f27
Pipeline #973 passed with stages
in 2 minutes 17 seconds
......@@ -11,6 +11,7 @@ from clinica.pipelines.machine_learning import base
import clinica.pipelines.machine_learning.voxel_based_io as vbio
import clinica.pipelines.machine_learning.vertex_based_io as vtxbio
import clinica.pipelines.machine_learning.region_based_io as rbio
import clinica.pipelines.machine_learning.tsv_based_io as tbio
import clinica.pipelines.machine_learning.svm_utils as utils
......@@ -394,3 +395,80 @@ class CAPSVertexBasedInput(CAPSInput):
    def save_weights_as_nifti(self, weights, output_dir):
        # No-op: weight maps are not written to NIfTI for this input type.
        pass
class CAPSTSVBasedInput(CAPSInput):
    """Input handler reading regional features from TSV files in a CAPS folder."""

    def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, dataset,
                 pvc=None, precomputed_kernel=None):
        """
        Args:
            caps_directory: path to the CAPS folder (here: the TSV file holding the features).
            subjects_visits_tsv: TSV file with 'participant_id' and 'session_id' columns.
            diagnoses_tsv: TSV file with the diagnosis of each subject.
            group_id: identifier of the group of subjects.
            image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute'
            atlas: atlas name; one of 'AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers'.
            dataset: dataset name (e.g. 'OASIS'); forwarded to the TSV reader.
            pvc: partial volume correction tag (optional).
            precomputed_kernel: optional precomputed kernel matrix.

        Raises:
            Exception: if `atlas` is not one of the supported atlas names.
        """
        # Validate early so a wrong atlas fails before any other setup is done.
        if atlas not in ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers']:
            raise Exception("Incorrect atlas name. It must be one of the values 'AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers' ")

        super(CAPSTSVBasedInput, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id,
                                                image_type, dataset, precomputed_kernel=precomputed_kernel)

        self._atlas = atlas
        self._pvc = pvc
        self._dataset = dataset
        self._orig_shape = None
        self._data_mask = None

    def get_images(self):
        """
        Build the feature-column name pattern for the configured modality.

        Returns: string used to select the feature columns in the TSV files.
        """
        if self._images is not None:
            return self._images

        if self._image_type == 'T1':
            self._images = str('group-' + self._group_id + '_T1w_space-' + self._atlas + '_map-graymatter')
        # TODO: implement the column pattern for PET modalities ('fdg', 'av45', 'pib', 'flute').

        return self._images

    def get_x(self):
        """
        Load the feature matrix from the TSV files (cached after the first call).

        Returns: a numpy 2d-array (one row per subject).
        """
        if self._x is not None:
            return self._x

        print('Loading TSV subjects')
        # NOTE(review): assumes get_images() was called first; otherwise
        # self._images may still be None here -- confirm the calling order.
        self._x = tbio.load_data(self._images, self._caps_directory, self._subjects_visits_tsv, self._dataset)
        print('Subjects loaded')

        return self._x

    def save_weights_as_nifti(self, weights, output_dir):
        """
        Save the classifier weights as a NIfTI image.

        Args:
            weights: array of region weights to write back onto the atlas.
            output_dir: directory where 'weights.nii.gz' is written.
        """
        output_filename = path.join(output_dir, 'weights.nii.gz')
        rbio.weights_to_nifti(weights, self._atlas, output_filename)
\ No newline at end of file
......@@ -554,3 +554,105 @@ class RB_RepKFold_DualSVM(base.MLWorkflow):
self._validation.save_results(self._output_dir)
self._input.save_weights_as_nifti(weights, classifier_dir)
class TB_RepHoldOut_DualSVM(base.MLWorkflow):
    """Repeated hold-out workflow training a dual SVM on TSV-based features."""

    def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, dataset,
                 output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3,
                 grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None):
        # Output location for classifier, parameters and results.
        self._output_dir = output_dir
        # Validation / grid-search configuration.
        self._n_threads = n_threads
        self._n_iterations = n_iterations
        self._test_size = test_size
        self._grid_search_folds = grid_search_folds
        self._balanced = balanced
        self._c_range = c_range
        self._splits_indices = splits_indices
        # TSV-based input provider; algorithm and validation are built in run().
        self._input = input.CAPSTSVBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id,
                                              image_type, atlas, dataset, pvc)
        self._validation = None
        self._algorithm = None

    def run(self):
        """Train, validate and persist the SVM classifier and its weights."""
        features = self._input.get_x()
        labels = self._input.get_y()
        gram = self._input.get_kernel()

        self._algorithm = algorithm.DualSVMAlgorithm(gram,
                                                     labels,
                                                     balanced=self._balanced,
                                                     grid_search_folds=self._grid_search_folds,
                                                     c_range=self._c_range,
                                                     n_threads=self._n_threads)
        self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size)

        classifier, best_params, results = self._validation.validate(labels, n_threads=self._n_threads, splits_indices=self._splits_indices)

        classifier_dir = path.join(self._output_dir, 'classifier')
        if not path.exists(classifier_dir):
            os.makedirs(classifier_dir)

        self._algorithm.save_classifier(classifier, classifier_dir)
        self._algorithm.save_parameters(best_params, classifier_dir)
        weights = self._algorithm.save_weights(classifier, features, classifier_dir)
        self._input.save_weights_as_nifti(weights, classifier_dir)
        self._validation.save_results(self._output_dir)
class TB_RepHoldOut_RandomForest(base.MLWorkflow):
    """Repeated hold-out workflow training a random forest on TSV-based features."""

    def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, dataset,
                 output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3,
                 grid_search_folds=10, balanced=True, n_estimators_range=(100, 200, 400),
                 max_depth_range=[None], min_samples_split_range=[2],
                 max_features_range=('auto', 0.25, 0.5), splits_indices=None):
        # Output location for classifier, parameters and results.
        self._output_dir = output_dir
        # Validation configuration.
        self._n_threads = n_threads
        self._n_iterations = n_iterations
        self._test_size = test_size
        self._grid_search_folds = grid_search_folds
        self._balanced = balanced
        # Random-forest hyper-parameter grid.
        self._n_estimators_range = n_estimators_range
        self._max_depth_range = max_depth_range
        self._min_samples_split_range = min_samples_split_range
        self._max_features_range = max_features_range
        self._splits_indices = splits_indices
        # TSV-based input provider; algorithm and validation are built in run().
        self._input = input.CAPSTSVBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id,
                                              image_type, atlas, dataset, pvc)
        self._validation = None
        self._algorithm = None

    def run(self):
        """Train, validate and persist the random-forest classifier and its weights."""
        x = self._input.get_x()
        y = self._input.get_y()

        self._algorithm = algorithm.RandomForest(x, y, balanced=self._balanced,
                                                 grid_search_folds=self._grid_search_folds,
                                                 n_estimators_range=self._n_estimators_range,
                                                 max_depth_range=self._max_depth_range,
                                                 min_samples_split_range=self._min_samples_split_range,
                                                 max_features_range=self._max_features_range,
                                                 n_threads=self._n_threads)
        self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size)

        classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices)

        # Use path.join like the sibling workflows (was os.path.join; identical result).
        classifier_dir = path.join(self._output_dir, 'classifier')
        if not path.exists(classifier_dir):
            os.makedirs(classifier_dir)

        self._algorithm.save_classifier(classifier, classifier_dir)
        self._algorithm.save_parameters(best_params, classifier_dir)
        # NOTE(review): unlike the SVM workflow, save_weights here is not passed x --
        # presumably RandomForest derives weights from feature importances; confirm.
        weights = self._algorithm.save_weights(classifier, classifier_dir)
        self._input.save_weights_as_nifti(weights, classifier_dir)
        self._validation.save_results(self._output_dir)
# coding: utf8
import numpy as np
import pandas as pd
import nibabel as nib
import os
__author__ = "Simona Bottani"
__copyright__ = "Copyright 2016-2018, The Aramis Lab Team"
__credits__ = ["Simona Bottani"]
__license__ = "See LICENSE.txt file"
__version__ = "0.1.0"
__maintainer__ = "Arnaud Marcoux"
__email__ = "simona.bottani@icm-institute.com"
__status__ = "Development"
def load_data(images, caps_directory, subjects_visits_tsv, dataset):
    """
    Load TSV-stored regional features for a list of subjects.

    Args:
        images: substring identifying the feature columns to keep (every
            column whose name contains this string is selected).
        caps_directory: path to the TSV file holding one row per participant
            with the feature columns.
        subjects_visits_tsv: path to a TSV file with a 'participant_id'
            column listing the subjects to load (in order).
        dataset: dataset name; for 'OASIS', rows with age_bl <= 61 are
            dropped before selection.

    Returns:
        np 2D array of shape (n_subjects, n_selected_columns).
    """
    # caps_directory and subjects_visits_tsv are already full paths; the
    # original single-argument os.path.join calls were no-ops and removed.
    df = pd.read_csv(caps_directory, sep='\t')

    if dataset == 'OASIS':
        # Presumably excludes the younger OASIS subjects -- keeps age_bl > 61.
        df = df[df.age_bl > 61]

    subjects_visits = pd.read_csv(subjects_visits_tsv, sep='\t')
    participant_id = subjects_visits.participant_id.values

    # Collect one feature row per subject, preserving the TSV order.
    # NOTE(review): like the original flat-append + np.split implementation,
    # this assumes exactly one matching row per participant -- confirm.
    rows = []
    for subject in participant_id:
        df_sub = df[df.participant_id == subject]
        feature_cols = [col for col in df_sub.columns if images in col]
        rows.append(df_sub[feature_cols].values.ravel())

    return np.asarray(rows, dtype=np.float64)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment