Commit d73c9123 by Ermal Toto

Initial commit

#!/usr/bin/env python
# coding: utf-8
# In[17]:
# Feature Importance
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
import argparse
import pandas as pd # Pandas is used to import the CSV file
import numpy as np
import random
import analyzerTools as at
import operator
import json
importancethreshold = 0.2
cached = True
dataStart = 5
dataEnd = -1
filename = "./FeatureFiles/DAIC/Labeled_NoMissing_originalOpenSmile.csv"
random.seed(123)
cutoff = 10
targetData = -1 #last feature used as class variable.
targetDataCount = 1
resampleType = "up" #up, down
svckernel = "linear" # ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given: ‘rbf’
missingValues = -999 #Remove Instances: Remove, 0, -100, -1
modelTypes = ["RF","kNN"]
numNeighbors = 6 # k for kNN (number of neighbours)
gridParams = "None"
optimizeFor = "f1"
identifierList = '{"ParticipantID": 0}'
importanceRange = '{"Start": 0.1, "End": 4,"Increment":0.1}'
identifierList = json.loads(identifierList)
uniqueIdentifier = list(identifierList.keys())[0]
parser = argparse.ArgumentParser()
parser.add_argument("--importanceRange", type=float, help='Importance threshold as average multiplier range: default {"Start": 0.1, "End": 4,"Increment":0.1}')
parser.add_argument("--filename", help="Input File")
parser.add_argument("--foldcache", help="Fold Cache File")
parser.add_argument("--outputfolder", help="Output Folder")
args = parser.parse_args()
if args.__dict__["filename"] is not None:
filename = args.__dict__["filename"]
if args.__dict__["foldcache"] is not None:
foldcache = args.__dict__["foldcache"]
if args.__dict__["outputfolder"] is not None:
outputfolder = args.__dict__["outputfolder"]
if args.__dict__["importanceRange"] is not None:
importanceRange = args.__dict__["importanceRange"]
importanceRange = json.loads(importanceRange)
data = pd.read_csv(filename)
with open(foldcache) as json_file:
    participantIdFolds = json.load(json_file)
print(importanceRange)
print(modelTypes)
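# Example invocation (a sketch; the script name, fold-cache path, and output
# folder below are assumptions, not taken from this commit):
#   python featureSelection.py \
#       --filename ./FeatureFiles/DAIC/Labeled_NoMissing_originalOpenSmile.csv \
#       --foldcache ./foldcache.json \
#       --outputfolder ./Results/ \
#       --importanceRange '{"Start": 0.1, "End": 4, "Increment": 0.1}'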
for modelType in modelTypes:
    for importancethreshold in np.arange(importanceRange["Start"],importanceRange["End"],importanceRange["Increment"]):
        print("Generating Features for importance: " + str(importancethreshold) + " model: " + modelType + " input: " + filename + " output: " + outputfolder)
        dataStart = 5
        dataEnd = -1
        data = pd.read_csv(filename)
        featureSubset = data[data.columns[dataStart:dataEnd]] #Skip PHQ-9 Responses
        target = data[data.columns[targetData]]
        target = np.where(target > cutoff, 1, 0)
        dataNames = data.columns.values
        featureNames = featureSubset.columns.values
        #print(np.mean(target))
        # fit an Extra Trees model to the data
        model = ExtraTreesClassifier()
        model.fit(featureSubset, target)
        # display the relative importance of each attribute
        importances = model.feature_importances_ #array with importances of each feature
        idx = np.arange(0, featureSubset.shape[1]) #create an index array, with the number of features
        print(np.mean(importances)*importancethreshold)
        features_to_keep_indeces = idx[importances > np.mean(importances)*importancethreshold] #only keep features whose importance exceeds the mean importance times the threshold multiplier
        #should be an array of roughly size 3
        features_to_keep_names = list(dataNames[range(0,dataStart)]) + list(featureNames[features_to_keep_indeces]) + list([dataNames[-1]])
        data = data[features_to_keep_names]
        resultsfile = outputfolder + "results_"+modelType+"_FeatureSelected_Labeled_NoMissing_"+str(importancethreshold)+"_"+str(data.shape[1])+".csv"
        # In[18]:
        if missingValues == -999:
            data = data.dropna()
        else:
            data = data.replace(np.nan, missingValues, regex=True) # Replace all missing values with missingValues as defined
        #Add Binary Class
        binary_classes = np.where(data[data.columns[targetData]] > cutoff, 1, 0)
        data["binary_class"] = binary_classes
        #Adjust for the addition of the binary label
        upToLabels = (targetDataCount*-1)-1
        binaryLabel = targetData
        numericLabel = targetData - 1
        if resampleType != "None":
            data = at.resampleDataset(data,resampleType,binaryLabel)
        from sklearn import metrics
        from sklearn import svm
        #from statistics import mean
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.multiclass import OneVsRestClassifier
        from sklearn.gaussian_process import GaussianProcessClassifier
        from sklearn.gaussian_process import GaussianProcessRegressor
        from sklearn.gaussian_process.kernels import RBF
        from sklearn.neural_network import MLPClassifier
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.neighbors import KNeighborsRegressor
        from sklearn.svm import LinearSVC
        from sklearn.svm import LinearSVR
        #from sklearn.svm import SVR
        #Cross Validation will happen with predefined folds for replicability
        #See script for generating folds
        #CV is implemented from scratch instead of using the scikit-learn
        #default CV, to have more control of the underlying mechanism, as this might
        #be required by meta-algorithms such as SCB (Toto et al. 2018 ECML-PKDD)
        master_y_pred = np.array(0)
        master_y_score = np.array(0)
        master_y_true = np.array(0)
        firstFold = True
        for fold in participantIdFolds:
            trainids = participantIdFolds[fold]["TRAIN"]
            testids = participantIdFolds[fold]["TEST"]
            trainset = data[data[uniqueIdentifier].isin(trainids)]
            testset = data[data[uniqueIdentifier].isin(testids)]
            #SVC, RF, kNN
            if modelType == "SVM":
                clf = svm.SVC(kernel=svckernel, probability=True) #probability=True so predict_proba below is available
                regressor = svm.LinearSVR()
            elif modelType == "RF":
                n_estimators = 30
                clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=10)
                regressor = RandomForestRegressor(n_estimators=n_estimators)
            elif modelType == "kNN":
                clf = KNeighborsClassifier(n_neighbors=numNeighbors)
                regressor = KNeighborsRegressor(n_neighbors=numNeighbors)
            elif modelType == "GP":
                kernel = 1.0 * RBF(1.0)
                clf = GaussianProcessClassifier(kernel=kernel, random_state=0)
                regressor = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=0.1, normalize_y=True)
            print(fold)
            #Classification Model with Binary Labels and Predictions
            clf.fit(trainset[trainset.columns[dataStart:upToLabels]], trainset[trainset.columns[binaryLabel]])
            y_pred = clf.predict(testset[testset.columns[dataStart:upToLabels]])
            y_score = clf.predict_proba(testset[testset.columns[dataStart:upToLabels]])
            #Ground Truth for Classification
            y_true = np.array(testset[testset.columns[binaryLabel]])
            #Regression Model and Predictions
            regressor.fit(trainset[trainset.columns[dataStart:upToLabels]], trainset[trainset.columns[numericLabel]])
            y_pred_reg = regressor.predict(testset[testset.columns[dataStart:upToLabels]])
            #Ground Truth for Regression
            y_true_numeric = np.array(testset[testset.columns[numericLabel]])
            #Combine Fold Results
            if firstFold:
                master_y_pred = y_pred
                master_y_score = y_score
                master_y_true = y_true
                master_y_pred_reg = y_pred_reg
                master_y_true_numeric = y_true_numeric
                master_y_testids = testset[testset.columns[:dataStart]]
            else:
                master_y_pred = np.concatenate((master_y_pred,y_pred),axis=None)
                master_y_score = np.concatenate((master_y_score,y_score),axis=0)
                master_y_pred_reg = np.concatenate((master_y_pred_reg,y_pred_reg),axis=None)
                master_y_true = np.concatenate((master_y_true,y_true),axis=None)
                master_y_true_numeric = np.concatenate((master_y_true_numeric,y_true_numeric),axis=None)
                master_y_testids = np.concatenate((master_y_testids,testset[testset.columns[:dataStart]]),axis=0)
            firstFold = False
            print(fold)
        isId = True
        for key, value in identifierList.items():
            if isId:
                finalResults = pd.DataFrame({key:master_y_testids[:,value].tolist()})
                isId = False
            else:
                finalResults[key] = master_y_testids[:,value].tolist()
        print("CHECK 1")
        finalResults['binaryPrediction'] = master_y_pred.tolist()
        finalResults['probabilityOfZero'] = master_y_score[:,0].tolist()
        finalResults['probabilityOfOne'] = master_y_score[:,1].tolist()
        finalResults['regressionPrediction'] = master_y_pred_reg.tolist()
        finalResults['binaryTruth'] = master_y_true.tolist()
        finalResults['regressionTruth'] = master_y_true_numeric.tolist()
        finalResults.to_csv(resultsfile, sep=',')
        print("CHECK 2")
# In[ ]:
#!/bin/bash
echo Adding labels using ${@: -1}
for filename in $*; do
    if [ "$filename" != "${@: -1}" ]; then
        echo $filename
        awk -F"," -v OFS=',' 'NR==FNR{temp=$1;$1="";a[temp]=$0;next} a[$1]{print $0 a[$1]}' "${@: -1}" "$filename" > "Labeled_$filename"
    fi
done
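# Usage sketch (the script and file names here are hypothetical): pass the feature
# CSVs first and the label CSV last. Each feature file is joined to the label file
# on its first column, and matching rows are written out as Labeled_<filename>:
#   ./addLabels.sh openSmileFeatures_part1.csv openSmileFeatures_part2.csv labels.csv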
#Author: Ermal Toto
# -*- coding: utf-8 -*-
import argparse
import random
import json
import analyzerTools as at
import numpy as np
import pandas as pd # Pandas is used to import the CSV file
import operator
#sklearn is used for ML algorithms
#from sklearn.model_selection import cross_validate
#from sklearn.model_selection import cross_val_predict
print("STARTING ANALYZER")
#Default Configuration Values
random.seed(123)
cutoff = 10
targetData = -1 #last feature used as class variable.
targetDataCount = 1
resampleType = "down" #up, down
svckernel = "poly" # ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given: ‘rbf’
missingValues = -999 #Remove Instances: Remove, 0, -100, -1
modelType = "RF" #SVC, RF, kNN
numNeighbors = 6 # k for kNN (number of neighbours)
gridParams = "None"
optimizeFor = "f1"
identifierList = '{"ParticipantID": 0, "QuestionNumber": 2,"Dataset":3,"Filename": 5}'
#-------------------------
#Begin Command Line Argument Parsing
parser = argparse.ArgumentParser()
parser.add_argument("--cutoff", type=int, help="Target Variable cutoff for binary classification")
parser.add_argument("--dataStart", type=int, help="Feature Subset Start Index")
parser.add_argument("--dataEnd", type=int, help="Feature Subset End Index")
parser.add_argument("--filename", help="Input filename, default: mtrComplete")
parser.add_argument("--resultsfile", help="Results output filename")
parser.add_argument("--resampleType", help="Resample Type, default: downsample", choices=['up','down','None'])
parser.add_argument("--svckernel", help="Support Vector Classifier Kernel, default: rbf", choices=['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'])
parser.add_argument("--modelType", help="Classifier Type, default: RF", choices=['SVM', 'RF', 'kNN','GP'])
parser.add_argument("--numNeighbors", type=int, help="Number of neighbours for kNN, default: 3")
parser.add_argument("--targetDataCount", type=int, help="Number of features that are part of the target class, default is 1")
parser.add_argument("--targetData", type=int, help="Location of target data. Default -1. This should always be negative")
parser.add_argument("--missingValues", type=int, help="what to replace missing values with -999 will remove them")
parser.add_argument("--gridParams", help="Parameter Scope for ML Models")
parser.add_argument("--optimizeFor", help="Optimization Target, Default: f1", choices=['precision', 'recall', 'f1','accuracy'])
parser.add_argument("--foldcache", help="Fold Cache File. Default=foldchache.json")
parser.add_argument("--identifierList", help='Json list of Identifiers and indeces: Default: {"ParticipantID": 0, "QuestionNumber": 2,"Dataset":3,"Filename": 5}. The first should allways be the unique scope identifier (i.e: between participants, or between questions)')
args = parser.parse_args()
if args.__dict__["missingValues"] is not None:
missingValues = args.__dict__["missingValues"]
if args.__dict__["targetData"] is not None:
targetData = args.__dict__["targetData"]
if args.__dict__["targetDataCount"] is not None:
targetDataCount = args.__dict__["targetDataCount"]
if args.__dict__["cutoff"] is not None:
cutoff = args.__dict__["cutoff"]
if args.__dict__["dataStart"] is not None:
dataStart = args.__dict__["dataStart"]
if args.__dict__["dataEnd"] is not None:
dataEnd = args.__dict__["dataEnd"]
if args.__dict__["filename"] is not None:
filename = args.__dict__["filename"]
if args.__dict__["resultsfile"] is not None:
resultsfile = args.__dict__["resultsfile"]
if args.__dict__["resampleType"] is not None:
resampleType = args.__dict__["resampleType"]
if args.__dict__["svckernel"] is not None:
svckernel = args.__dict__["svckernel"]
if args.__dict__["modelType"] is not None:
modelType = args.__dict__["modelType"]
if args.__dict__["numNeighbors"] is not None:
numNeighbors = args.__dict__["numNeighbors"]
if args.__dict__["gridParams"] is not None:
gridParams = args.__dict__["gridParams"]
if args.__dict__["optimizeFor"] is not None:
optimizeFor = args.__dict__["optimizeFor"]
if args.__dict__["foldcache"] is not None:
foldcache = args.__dict__["foldcache"]
with open(foldcache) as json_file:
participantIdFolds = json.load(json_file)
if args.__dict__["identifierList"] is not None:
identifierList = args.__dict__["identifierList"]
import os.path
if os.path.isfile(resultsfile):
print ("Results File: " + resultsfile +" does exist")
exit()
else:
print ("Results File: " + resultsfile +" does not exist")
identifierList = json.loads(identifierList)
uniqueIdentifier = list(identifierList.keys())[0]
#End Command Line Argument Parsing
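# Example invocation (a sketch; the script name and file paths are assumptions,
# not taken from this commit):
#   python analyzer.py --filename Labeled_features.csv --foldcache foldcache.json \
#       --resultsfile results_RF.csv --dataStart 5 --modelType RF --resampleType down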
#Open Data File
data = pd.read_csv(filename)
if missingValues == -999:
    data = data.dropna()
else:
    data = data.replace(np.nan, missingValues, regex=True) # Replace all missing values with missingValues as defined
#Add Binary Class
binary_classes = np.where(data[data.columns[targetData]] > cutoff, 1, 0)
data["binary_class"] = binary_classes
#Adjust for the addition of the binary label
upToLabels = (targetDataCount*-1)-1
binaryLabel = targetData
numericLabel = targetData - 1
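# With the defaults (targetDataCount = 1, targetData = -1) this gives upToLabels = -2,
# so the feature slices below exclude both the numeric target and the appended
# binary_class column; binaryLabel (-1) then indexes binary_class and
# numericLabel (-2) the original numeric target.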
if resampleType != "None":
    data = at.resampleDataset(data,resampleType,binaryLabel)
from sklearn import metrics
from sklearn import svm
#from statistics import mean
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
#from sklearn.svm import SVR
#Cross Validation will happen with predefined folds for replicability
#See script for generating folds
#CV is implemented from scratch instead of using the scikit-learn
#default CV, to have more control of the underlying mechanism, as this might
#be required by meta-algorithms such as SCB (Toto et al. 2018 ECML-PKDD)
master_y_pred = np.array(0)
master_y_score = np.array(0)
master_y_true = np.array(0)
firstFold = True
for fold in participantIdFolds:
    trainids = participantIdFolds[fold]["TRAIN"]
    testids = participantIdFolds[fold]["TEST"]
    trainset = data[data[uniqueIdentifier].isin(trainids)]
    testset = data[data[uniqueIdentifier].isin(testids)]
    #SVC, RF, kNN
    if modelType == "SVM":
        clf = svm.SVC(kernel=svckernel, probability=True)
        regressor = svm.SVR(kernel=svckernel)
    elif modelType == "RF":
        n_estimators = 30
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=10)
        regressor = RandomForestRegressor(n_estimators=n_estimators)
    elif modelType == "kNN":
        clf = KNeighborsClassifier(n_neighbors=numNeighbors)
        regressor = KNeighborsRegressor(n_neighbors=numNeighbors)
    elif modelType == "GP":
        kernel = 1.0 * RBF(1.0)
        clf = GaussianProcessClassifier(kernel=kernel, random_state=0)
        regressor = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=0.1, normalize_y=True)
    #Classification Model with Binary Labels and Predictions
    clf.fit(trainset[trainset.columns[dataStart:upToLabels]], trainset[trainset.columns[binaryLabel]])
    y_pred = clf.predict(testset[testset.columns[dataStart:upToLabels]])
    y_score = clf.predict_proba(testset[testset.columns[dataStart:upToLabels]])
    #Ground Truth for Classification
    y_true = np.array(testset[testset.columns[binaryLabel]])
    #Regression Model and Predictions
    regressor.fit(trainset[trainset.columns[dataStart:upToLabels]], trainset[trainset.columns[numericLabel]])
    y_pred_reg = regressor.predict(testset[testset.columns[dataStart:upToLabels]])
    #Ground Truth for Regression
    y_true_numeric = np.array(testset[testset.columns[numericLabel]])
    #Combine Fold Results
    if firstFold:
        master_y_pred = y_pred
        master_y_score = y_score
        master_y_true = y_true
        master_y_pred_reg = y_pred_reg
        master_y_true_numeric = y_true_numeric
        master_y_testids = testset[testset.columns[:dataStart]]
    else:
        master_y_pred = np.concatenate((master_y_pred,y_pred),axis=None)
        master_y_score = np.concatenate((master_y_score,y_score),axis=0)
        master_y_pred_reg = np.concatenate((master_y_pred_reg,y_pred_reg),axis=None)
        master_y_true = np.concatenate((master_y_true,y_true),axis=None)
        master_y_true_numeric = np.concatenate((master_y_true_numeric,y_true_numeric),axis=None)
        master_y_testids = np.concatenate((master_y_testids,testset[testset.columns[:dataStart]]),axis=0)
    firstFold = False
isId = True
for key, value in identifierList.items():
    if isId:
        finalResults = pd.DataFrame({key:master_y_testids[:,value].tolist()})
        isId = False
    else:
        finalResults[key] = master_y_testids[:,value].tolist()
finalResults['binaryPrediction'] = master_y_pred.tolist()
finalResults['probabilityOfZero'] = master_y_score[:,0].tolist()
finalResults['probabilityOfOne'] = master_y_score[:,1].tolist()
finalResults['regressionPrediction'] = master_y_pred_reg.tolist()
finalResults['binaryTruth'] = master_y_true.tolist()
finalResults['regressionTruth'] = master_y_true_numeric.tolist()
finalResults.to_csv(resultsfile, sep=',')
from sklearn.utils import resample
import collections
import pandas as pd # Pandas is used to import the CSV file
def resampleDataset(dataset,resampleType,targetData):
    #Identify majority and minority class for upsample/downsample procedures
    targetClassCount = collections.Counter(dataset[dataset.columns[targetData]])
    majorityKey = max(targetClassCount, key=targetClassCount.get)
    majorityCount = targetClassCount[majorityKey]
    minorityKey = min(targetClassCount, key=targetClassCount.get)
    minorityCount = targetClassCount[minorityKey]
    #Separate minority and majority classes (assumes the class label is the last column)
    dataset_majority = dataset[dataset[dataset.columns[len(dataset.columns)-1]] == majorityKey]
    dataset_minority = dataset[dataset[dataset.columns[len(dataset.columns)-1]] == minorityKey]
    if resampleType == "up":
        # Upsample minority class with replacement
        dataset_minority_upsampled = resample(dataset_minority,replace=True,n_samples=majorityCount)
        # Combine majority class with upsampled minority class
        dataset_upsampled = pd.concat([dataset_majority, dataset_minority_upsampled])
        dataset_upsampled = dataset_upsampled.sample(frac=1).reset_index(drop=True)
        dataset = dataset_upsampled
    elif resampleType == "down":
        # Downsample majority class without replacement
        dataset_majority_downsampled = resample(dataset_majority,replace=False,n_samples=minorityCount)
        # Combine downsampled majority class with the minority class
        dataset_downsampled = pd.concat([dataset_majority_downsampled, dataset_minority])
        dataset_downsampled = dataset_downsampled.sample(frac=1).reset_index(drop=True)
        dataset = dataset_downsampled
    return dataset
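# Minimal usage sketch (assumes a DataFrame whose last column holds the binary
# class label, as the analyzer scripts above arrange before calling this):
#   balanced = resampleDataset(data, "down", -1)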
#!/bin/bash
for filename in $*; do
    echo $filename
    awk -F, '$20==""' $filename | wc
done
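# Usage sketch (the script name is hypothetical): for each CSV given, print wc
# counts of the rows whose 20th comma-separated field is empty:
#   ./countEmptyField20.sh *.csv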
{
"1": {
"TRAIN": [
8801,
1301,
5039,
3702,
8889,
8366,
8423,
3113,
9550,
5683,
7984,
2111,
2806,
1134,
5364,
8760,
4186,
3733,
6990,
4356,
2819,
2605,
970,
1779,
4894,
249,
1526,
9513,
5547,
4362,
6945,
9766,
2373,
6345,
8917,
8685,
6405,
1043,
3036,
18
],
"TEST": [
6145,
4690,
9871,
9890,
7544,
8326,
1175,
5111,
5631,
2828,
122,
3893,
7203,
7683,
2978,
9137,
9170,
1107,
1367,
9387
]
},
"2": {
"TRAIN": [
6145,
8801,
1301,
4690,
9871,
9890,
7544,
5039,
8326,
8889,
1175,
5111,
8423,
5631,
5683,
2111,
2806,
1134,
5364,
4356,
2828,
122,
4894,
3893,
1526,
9513,
7203,
4362,
9766,
7683,
2978,
2373,
9137,
6345,
9170,
8917,
1107,
1367,
9387,
18
],
"TEST": [
3702,
8366,
3113,
9550,
7984,
8760,
4186,
3733,
6990,
2819,
2605,
970,
1779,
249,
5547,
6945,
8685,
6405,
1043,
3036
]
},
"3": {
"TRAIN": [
6145,
4690,
9871,
9890,
7544,
8326,
3702,
8366,
1175,
5111,
3113,
9550,
5631,
7984,
8760,
4186,
3733,
6990,
2819,
2605,
2828,
970,
122,
1779,
3893,
249,
7203,
5547,
6945,
7683,
2978,
9137,
9170,
1107,
8685,
1367,
6405,
1043,
9387,
3036
],
"TEST": [
8801,
1301,
5039,
8889,
8423,
5683,
2111,
2806,
1134,
5364,
4356,
4894,
1526,
9513,
4362,
9766,
2373,
6345,
8917,
18
]
}
}
{
"1": {
"TRAIN": [
18,
122,
1107,
1301,
1779,
2373,
2605,
2806,
3113,
3893,
4186,
4690,
5039,
5111,
5547,
5631,
5683,
6945,
6990,
7203,
7683,
7984,
8366,
8423,
8685,
8760,
8801,
8917,
9137,
9170,
9766,
9890
],
"TEST": [
1134,
1175,