"""
Test2: data and KSC appliaction script generation
==================================================
This script is for generating the data set and KSC application shell scripts
for :ref:`Test2 <sec_test2>`.
Data generation
----------------
Data are generated from isotropic Gaussian distributions with constant standard
deviation. The number of cluster centers, their minimum separation (within a
given box), the number of features (dimensions) can be specified as input
arguments. The number of generated samples as well as the training and validaiton
sub-sample sizes might be changed as well. The data generation related input
parameters are listed in the :numref:`table1_kscICholTest2`.
.. _table1_kscICholTest2:
.. table:: Data generation related input parameters, their configuration flags
and default values.
=========== ============== ==================
Flag Default Description
=========== ============== ==================
**-c** 8 number of cluster centers
**-f** 2 number of features
**-n** 100 000 number of samples
**-t** 20 000 number of *training* sub-samples
**-v** 10 000 number of *validation* sub-samples
**-d** 5 minimum distance between cluster centers
**-s** 0.8 standard deviation of the underlying Gaussians
=========== ============== ==================
.. note:: feature values are generated such that they are in the [-20:20] box/range
(before the standardisation). At initialisation, the required number of cluster
centers are generated (uniformly random) in the (hyper) box, such that the
generated cluster centers has the required minimum distance from each other.
When after a given number of trial (1000) this condition cannot be fullfilled,
i.e. random cluster centers, with the given minimum distance, could not be
generated, the script is interapted with and error message. You need to change
the minimum distance or number of cluster ceter parameters in this case.
Shell script generation
------------------------
After generating and writing the data set in files (including sub-samples for
training and validation), the shell scripts will be generated for the KSC
applicaitons provided for *training*, hyper parameter *tuning* and out-of-sample
extension i.e. *testing*.
Note, that since the data set is generated by the same script, number of training
and validation set size, their location, number of features, etc ... are known.
Moreover, the usually unknown exact number of cluster center hyper parameter value
is also well known in this case. The shell scripts, generated for the KSC
applications will contain these known parameter values.
Moreover, some initial values for the RBF kernel bandwidth hyper parameter values
will be estimated by the script (both for the IDC and KSC parts) using the
:ref:`estimateBW.py <estimatebw_doc>` script provided in the :ref:`Utils <sec_utils>`.
However, these estimates are often inaccurate and leads to very poor results.
An input parameter is available to scale these initial estimates to find the
appropriate values that is listed in :numref:`table2_kscICholTest2`
(see :ref:`Test2 <sec_test2>` for more details).
.. _table2_kscICholTest2:
.. table:: KSC shell script generation related input parameters, their
configuration flags and default values.
=========== ===================== ==================
Flag Default Description
=========== ===================== ==================
**-x** :math:`0.4\sqrt{N}` scaling factor for the bandwidth estimation
**-e** 0.85 level of tolerated error in the inc. Cholesky approximation
**-r** 300 maximum allowed rank of the inc. Cholesky approximation
**-m** 'AMS' cluster memberships encoding-decoding {'AMS','BAS','BLF'}
=========== ===================== ==================
Examples
---------
This example shows how to execute the data and the corresponding initial
KSC shell script generation with the default parameters. Note, that this will
print the available parameters, their default value and the corresponding flags
to change them ::
bash-3.2$ python kscICholTest2.py
==== (Python) === : Parameters (-flag to change) ...
----------------------------------------------------------
--- Number of cluster centers (-c) : 8
--- Number of features (-f) : 2
--- Number of smaples (-n) : 1.000e+05
--- Training sub-smaples (-t) : 2.000e+04
--- Validation sub-smaples (-v) : 1.000e+04
--- Min. cluster center distance (-d) : 5.000e+00
--- STDV of the underlying Gaussians (-s) : 8.000e-01
--- Scaling of the kernel BW estimate (-x) : 5.657e+01
--- Inc. Cholesky error tolerance (-e) : 8.500e-01
--- Inc. Cholesky maxium rank (-r) : 3.000e+02
--- KSC membership encoding-decoding (-m) : AMS
----------------------------------------------------------
==== (Python) === : Generating data ...
==== (Python) === : Generating (initial) script for hyper-parameter tuning ...
==== (Python) === : Generating (initial) script for training ...
==== (Python) === : Generating (initial) script for testing ...
"""
import os
import sys
import getopt
import numpy as np
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from scipy.spatial import distance
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
sys.path.append('../utils/')
import estimateBW
##
## Number of clusters and features of the generated data
## -----------------------------------------------------------------------------
gNumClusters = 8 #
gNumFeatures = 2 #
## -----------------------------------------------------------------------------
##
## Size of the generated data set with the training/validation sub-sample sizes.
## -----------------------------------------------------------------------------
gNumSamples = 100000 # total number of samples generated
gNumTrain = 20000 # sub-sample size for training
gNumValid = 10000 # sub-sample size for validation
## -----------------------------------------------------------------------------
##
## Cluster separation related valriables (we will change the standard-dev only)
## -----------------------------------------------------------------------------
gClustersBox = 20.0 # feature value rang is [-gClustersBox:+gClustersBox]
gClusterDist = 5.0 # minimum separation between each cluster center
gClusterSTDV = 0.8 # standard deviation of the clusters
## -----------------------------------------------------------------------------
##
## Kernel bandwidth estimation options (we will use a scalled ISJ)
## -----------------------------------------------------------------------------
#gTypeBW = 'Silverman'
#gScaleBW = 0.05
## .............................................................................
gTypeBW = 'ISJ'
gScaleBW = 0.4*np.sqrt(gNumTrain)
## -----------------------------------------------------------------------------
##
## Some of the initial parameter values emitted to the KSC appliaction scripts
## -----------------------------------------------------------------------------
gICholMaxRank = 300 # max rank parameter for the KSC - ICD part
gIcholErrTol = 0.85 # the tolerated error level in the inc. Cholesky part
gClEncodScheme = 'AMS' # cluster membership encoding-decoding {"AMS","BAS","BLF"}
## -----------------------------------------------------------------------------
##
## Path to the KscIchol_Tune, KscIchol_Train and KscIchol_Test executables
## -----------------------------------------------------------------------------
gPathTo_kscIchol = '../../bin/'
## -----------------------------------------------------------------------------
##
## Directories to store the generated data and the output of the KSC applicatons
## -----------------------------------------------------------------------------
gOutDataPath = 'data'
gOutWorkPath = 'out'
## -----------------------------------------------------------------------------
##
## Generate data file names (for all data, their labels, training and validation)
## -----------------------------------------------------------------------------
gNameDat = ''.join([gOutDataPath,'/data','.dat'])
gNameLab = ''.join([gOutDataPath,'/data_','Labels.dat'])
gNameDatTrain = ''.join([gOutDataPath,'/data_','Train_N',str(gNumTrain),'.dat'])
gNameDatValid = ''.join([gOutDataPath,'/data_','Valid_N',str(gNumValid),'.dat'])
## -----------------------------------------------------------------------------
# fix the random number generator seed (for reproducibility)
np.random.seed(123)
## prints (some) parameters before data and script generation
[docs]def PrintParam():
print (" ==== (Python) === : Parameters (-flag to change) ...")
print (" ---------------------------------------------------------- ")
print (" --- Number of cluster centers (-c) : {:>12d}".format(gNumClusters))
print (" --- Number of features (-f) : {:>12d}".format(gNumFeatures))
print (" --- Number of smaples (-n) : {:>12.3e}".format(gNumSamples))
print (" --- Training sub-smaples (-t) : {:>12.3e}".format(gNumTrain))
print (" --- Validation sub-smaples (-v) : {:>12.3e}".format(gNumValid))
print (" --- Min. cluster center distance (-d) : {:>12.3e}".format(gClusterDist))
print (" --- STDV of the underlying Gaussians (-s) : {:>12.3e}".format(gClusterSTDV))
print (" --- Scaling of the kernel BW estimate (-x) : {:>12.3e}".format(gScaleBW))
print (" --- Inc. Cholesky error tolerance (-e) : {:>12.3e}".format(gIcholErrTol))
print (" --- Inc. Cholesky maxium rank (-r) : {:>12.3e}".format(gICholMaxRank))
print (" --- KSC membership encoding-decoding (-m) : {:>12}".format(gClEncodScheme))
print (" ---------------------------------------------------------- ")
################################################################################
[docs]def GenerateData():
print(" ==== (Python) === : Generating data ...")
os.system('rm -f -r data')
os.system('mkdir data')
# generate NClusters, NFeatures-dimensional cluster centers such that
# their feature values are in the [-ClustersBox,+ClustersBox] interval and
# the picewise distance between the cluster centers is at least ClusterDist
# Note: if cluster centers with the given parameters cannot be generated
# (i.e. with the given minimum distance between the centers)
# within 100 trials and error message will be given. Paramaters, such
# as ClusterSTD, ClusterDist, ClustersBox or even NClusters, NFeatures
# must be adjusted to have more space between the clusters.
count = 0
while True:
centers = (2.0*np.random.rand(gNumClusters, gNumFeatures)-1.0)*gClustersBox
dist = distance.cdist(centers, centers, 'euclidean')
if (np.where(dist>0., dist, 2.0*gClusterDist)).min() > gClusterDist or count>1000:
break;
else:
count += 1
## check if centers could be generated
if count>1000:
print("FAILD to generate centers with the given condistions")
exit()
## generated data
xData, yData = sklearn.datasets.make_blobs(n_samples = gNumSamples,
n_features = gNumFeatures,
centers = centers,
cluster_std = gClusterSTDV,
center_box = (-gClustersBox,gClustersBox))
## shuffle to get random examples (store the state and set back)
#np.random.seed(12345678)
rngSt0 = np.random.get_state()
np.random.shuffle(xData)
np.random.set_state(rngSt0)
np.random.shuffle(yData)
## standardise the data
xData = StandardScaler().fit_transform(xData)
## save data to files
np.savetxt(gNameDat , xData , fmt='%14.6e')
np.savetxt(gNameLab , yData , fmt='%d' )
np.savetxt(gNameDatTrain, xData[0:gNumTrain,] , fmt='%14.6e')
np.savetxt(gNameDatValid, xData[gNumTrain:gNumTrain+gNumValid,], fmt='%14.6e')
################################################################################
[docs]def GenerateScriptTraining():
print(" ==== (Python) === : Generating (initial) script for training ...")
os.system('rm -f -r out')
os.system('mkdir out')
##
## The dictionary for generating the kscIchol_Train input parameters
script_kscIchol_Train = {}
##
## the main program:
script_kscIchol_Train['main'] = gPathTo_kscIchol+'KscIchol_Train'
##
## the incomplete Cholesky decomposition related:
script_kscIchol_Train['--icholTolError'] = gIcholErrTol
script_kscIchol_Train['--icholMaxRank'] = gICholMaxRank
## this will be estimated below (using Silverman's rule of thumb or ISJ) !!!
script_kscIchol_Train['--icholRBFKernelPar'] = 0.1
##
script_kscIchol_Train['--icholRedSetFile'] = gOutWorkPath+'/ReducedSetData.dat'
script_kscIchol_Train['--icholPermVectFile'] = gOutWorkPath+'/PermutationVector.dat'
##
## the training data set related:
script_kscIchol_Train['--trDataNumber'] = gNumTrain
script_kscIchol_Train['--trDataDimension'] = gNumFeatures
script_kscIchol_Train['--trDataFile'] = gNameDatTrain
##
## the clustering related:
script_kscIchol_Train['--clNumber'] = gNumClusters
## this will be estimated below (using Silverman's rule of thumb or ISJ) !!!
script_kscIchol_Train['--clRBFKernelPar'] = 0.1
script_kscIchol_Train['--clEncodingScheme'] = gClEncodScheme
script_kscIchol_Train['--clEvalOutlierThrs'] = 5
script_kscIchol_Train['--clEvalWBalance'] = 0.5
script_kscIchol_Train['--clLevel'] = 1
script_kscIchol_Train['--clResFile'] = gOutWorkPath+'/CRes.dat'
script_kscIchol_Train['--clResDataFile'] = gOutWorkPath+'/CData.dat'
##
## other, optional
script_kscIchol_Train['--verbosityLevel'] = 2
script_kscIchol_Train['--numBLASThreads'] = 4
##
## estiamte optimal kernel bandwidth (using Silverman's rule of thumb or ISJ)
## (see more at ../utils/estimateBW.py):
## load training data, execute estimation, replace the parameters in dict
trDataF = script_kscIchol_Train['--trDataFile']
trData = np.loadtxt(trDataF, dtype=float)
#estedBW = gScaleBW*estimateBW.EstimateBW(trData, 'Silverman')
estedBW = gScaleBW*estimateBW.EstimateBW(trData, gTypeBW)
strBW = ",".join([np.format_float_scientific(h,precision=4) for h in estedBW])
## set the gNumFeatures dimensional estimated b.w. for the Inc-Col. part
script_kscIchol_Train['--icholRBFKernelPar'] = strBW
## set the mean (over the gNumFeatures dimensions) for the clustering (i.e. 1D)
script_kscIchol_Train['--clRBFKernelPar'] = np.mean(estedBW)
##
## print
runMe = script_kscIchol_Train['main'] + ' \\\n'
cntr = 1
for cmd, par in script_kscIchol_Train.items():
if cmd != 'main':
if cntr!=len(script_kscIchol_Train)-1:
runMe += ''.join([str(cmd), " ", str(par), ' \\\n'])
else:
runMe += ''.join([str(cmd), " ", str(par), ' \n'])
cntr += 1
## save the script to a .sh file
f = open("runKSCIchol_Train.sh","w")
f.write(runMe)
f.close()
##
## print(" ==== (Python) === : Executing training the KSC-IChol model on the training set...")
## os.system(runMe)
################################################################################
[docs]def GenerateScriptTuning():
print(" ==== (Python) === : Generating (initial) script for hyper-parameter tuning ...")
os.system('rm -f -r out')
os.system('mkdir out')
##
## The dictionary for generating the kscIchol_Tune input parameters
script_kscIchol_Tune = {}
##
## the main program:
script_kscIchol_Tune['main'] = gPathTo_kscIchol+'KscIchol_Tune'
##
## the incomplete Cholesky decomposition related:
script_kscIchol_Tune['--icholTolError'] = gIcholErrTol
script_kscIchol_Tune['--icholMaxRank'] = gICholMaxRank
## this will be estimated below using Silverman's rule of thumb !!!
script_kscIchol_Tune['--icholRBFKernelPar'] = 1.0
##
## the training data set related:
script_kscIchol_Tune['--trDataNumber'] = gNumTrain
script_kscIchol_Tune['--trDataDimension'] = gNumFeatures
script_kscIchol_Tune['--trDataFile'] = gNameDatTrain
##
## the validation data set related:
script_kscIchol_Tune['--valDataNumber'] = gNumValid
script_kscIchol_Tune['--valDataFile'] = gNameDatValid
##
## the clustering related:
## min/max number of clusters to try: each integer value will be tested between
## NOTE: the max is set now to 2x the real number of clusters (that is known now)
script_kscIchol_Tune['--minClusterNumber'] = 3
script_kscIchol_Tune['--maxClusterNumber'] = 2*gNumClusters
## kernel parameter values to try: will be set below !!!
script_kscIchol_Tune['--kernelParameters'] = 1.0
script_kscIchol_Tune['--clEvalOutlierThrs'] = 5
script_kscIchol_Tune['--clEncodingScheme'] = gClEncodScheme
script_kscIchol_Tune['--clEvalWBalance'] = 0.5
##
## other, optional
script_kscIchol_Tune['--verbosityLevel'] = 2
script_kscIchol_Tune['--numBLASThreads'] = 4
script_kscIchol_Tune['--resFile'] = gOutWorkPath+'/TuningRes.dat'
##
## estiamte optimal kernel bandwidth using Silverman's rule of thumb (see
## more at ../utils/estimateBW.py):
## load training data, execute estimation, replace the parameters in dict
trDataF = script_kscIchol_Tune['--trDataFile']
trData = np.loadtxt(trDataF, dtype=float)
#estedBW = gScaleBW*estimateBW.EstimateBW(trData, 'Silverman')
estedBW = gScaleBW*estimateBW.EstimateBW(trData, gTypeBW)
strBW = ",".join([np.format_float_scientific(h,precision=4) for h in estedBW])
## set the gNumFeatures dimensional, 1D estimated b.w. for the Inc-Col. part
script_kscIchol_Tune['--icholRBFKernelPar'] = strBW
## generate kernel parameter grid (we will have 50 values from 0.01xV to
## 100xV with log-spacing, where V is the mean (over the gNumFeatures
## diemsnions) Silverman rule of thumb estiates
meanBW = np.mean(estedBW)
kerPars = np.logspace(np.log10(0.05*meanBW), np.log10(50.0*meanBW), 30)
strPars = ",".join([np.format_float_scientific(h,precision=4) for h in kerPars])
script_kscIchol_Tune['--kernelParameters'] = strPars
##
## print
runMe = script_kscIchol_Tune['main'] + ' \\\n'
cntr = 1
for cmd, par in script_kscIchol_Tune.items():
if cmd != 'main':
if cntr!=len(script_kscIchol_Tune)-1:
runMe += ''.join([str(cmd), " ", str(par), ' \\\n'])
else:
runMe += ''.join([str(cmd), " ", str(par), ' \n'])
cntr += 1
## save the script to a .sh file
f = open("runKSCIchol_Tune.sh","w")
f.write(runMe)
f.close()
##
## print(" ==== (Python) === : Executing hyper-parameter tuning for the KSC-IChol model ...")
## os.system(runMe)
################################################################################
[docs]def GenerateScriptTesting():
print(" ==== (Python) === : Generating (initial) script for testing ...")
os.system('rm -f -r out')
os.system('mkdir out')
##
## The dictionary for generating the kscIchol_Tune input parameters
script_kscIchol_Test = {}
##
## the main program:
script_kscIchol_Test['main'] = gPathTo_kscIchol+'KscIchol_Test'
##
## the incomplete Cholesky decomposition related:
script_kscIchol_Test['--icholTolError'] = gIcholErrTol
script_kscIchol_Test['--icholMaxRank'] = gICholMaxRank
## this will be estimated below (using Silverman's rule of thumb or ISJ) !!!
script_kscIchol_Test['--icholRBFKernelPar'] = 1.0
##
## the training data set related:
script_kscIchol_Test['--trDataNumber'] = gNumTrain
script_kscIchol_Test['--trDataDimension'] = gNumFeatures
script_kscIchol_Test['--trDataFile'] = gNameDatTrain
##
## the test data set related:
script_kscIchol_Test['--tstDataNumber'] = gNumSamples
script_kscIchol_Test['--tstDataFile'] = gNameDat
##
## the clustering related:
script_kscIchol_Test['--clNumber'] = gNumClusters
## this will be estimated below (using Silverman's rule of thumb or ISJ) !!!
script_kscIchol_Test['--clRBFKernelPar'] = 0.1
script_kscIchol_Test['--clEncodingScheme'] = gClEncodScheme
script_kscIchol_Test['--clEvalWBalance'] = 0.5
script_kscIchol_Test['--clEvalOutlierThrs'] = 5
script_kscIchol_Test['--clLevel'] = 1
script_kscIchol_Test['--clResFile'] = gOutWorkPath+'/CRes.dat'
##
## other, optional
script_kscIchol_Test['--verbosityLevel'] = 2
script_kscIchol_Test['--numBLASThreads'] = 4
##
## estiamte optimal kernel bandwidth (using Silverman's rule of thumb or ISJ)
## (see more at ../utils/estimateBW.py):
## load training data, execute estimation, replace the parameters in dict
trDataF = script_kscIchol_Test['--trDataFile']
trData = np.loadtxt(trDataF, dtype=float)
estedBW = gScaleBW*estimateBW.EstimateBW(trData, gTypeBW)
strBW = ",".join([np.format_float_scientific(h,precision=4) for h in estedBW])
## set the gNumFeatures dimensional, 1D estimated b.w. for the Inc-Col. part
script_kscIchol_Test['--icholRBFKernelPar'] = strBW
## set the mean (over the gNumFeatures dimensions) for the clustering (i.e. 1D)
script_kscIchol_Test['--clRBFKernelPar'] = np.mean(estedBW)
##
## print
runMe = script_kscIchol_Test['main'] + ' \\\n'
cntr = 1
for cmd, par in script_kscIchol_Test.items():
if cmd != 'main':
if cntr!=len(script_kscIchol_Test)-1:
runMe += ''.join([str(cmd), " ", str(par), ' \\\n'])
else:
runMe += ''.join([str(cmd), " ", str(par), ' \n'])
cntr += 1
## save the script to a .sh file
f = open("runKSCIchol_Test.sh","w")
f.write(runMe)
f.close()
##
# print(" ==== (Python) === : Executing testing the KSC-IChol model ...")
# os.system(runMe)
################################################################################
[docs]def Help():
print ('kscICholTest2.py [-c #cluster centers] [-f #features] [-n #samples] ',
'[-t #training data] [-v #validation data] [-d min. distance between centers] ',
'[-s stdv of the underlying Gaussians] [-x scale the b.w. estimation result] ',
'[-e ichol error tolerance] [-r ichol max. rank] ',
'[-m cluster membership encoding-decoding scheme {\'AMS\',\'BAS\',\'BLF\'}]')
[docs]def main(argv):
try:
opts, args = getopt.getopt(argv,"hc:f:n:t:v:d:s:x:e:r:m:")
except getopt.GetoptError:
Help()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
Help()
sys.exit()
elif opt in ("-c"):
global gNumClusters
gNumClusters = int(arg)
elif opt in ("-f"):
global gNumFeatures
gNumFeatures = int(arg)
elif opt in ("-n"):
global gNumSamples
gNumSamples = int(arg)
elif opt in ("-t"):
global gNumTrain
gNumTrain = int(arg)
elif opt in ("-v"):
global gNumValid
gNumValid = int(arg)
elif opt in ("-d"):
global gClusterDist
gClusterDist = float(arg)
elif opt in ("-s"):
global gClusterSTDV
isScaling = True
gClusterSTDV = float(arg)
elif opt in ("-x"):
global gScaleBW
gScaleBW = float(arg)
elif opt in ("-e"):
global gIcholErrTol
gIcholErrTol = min(1.0, max(0.0, float(arg)))
elif opt in ("-r"):
global gICholMaxRank
gICholMaxRank = max(1.0,float(arg))
elif opt in ("-m"):
global gClEncodScheme
if arg in ('AMS','BAS','BLF'):
gClEncodScheme = arg
else:
print(" *** cluster membership encoding-decoding scheme -m ",arg)
Help()
sys.exit(2)
# generate data and KSC application scripts with initial parameters
PrintParam()
GenerateData()
GenerateScriptTuning()
GenerateScriptTraining()
GenerateScriptTesting()
################################################################################
if __name__ == "__main__":
main(sys.argv[1:])