Source code for evaluateClusteringRes

"""
.. _evaluateClusteringRes_doc:

Evaluate clustering result
===========================

Auxiliary script for evaluating the results of the clustering. 

After executing the KSC application for testing or training, the assigned
cluster labels are available in a file (specified as input argument of the KSC
applications). This script can be used to evaluate this result using either
the true labels (if available) or the data that have been clustered. The
**Adjusted Rand Index (ARI)** can be computed in the first case, while the
**Silhouette Score (SC)** can be computed in the second case.


Example
-------

If the clustering result is located in the :math:`\\texttt{out/CRes.dat}`
file and the true cluster labels are available and located in the
:math:`\\texttt{data/data}\\_\\texttt{Labels.dat}` file, then the ARI can be
computed as ::

    python ../utils/evaluate.py -c out/CRes.dat -t data/data_Labels.dat 

In case the true cluster labels are not available and the data used to cluster
is located in the :math:`\\texttt{data/data.dat}` file, one can use the script to
compute the SC as ::

    python ../utils/evaluate.py -c out/CRes.dat -d data/data.dat -s 

.. note:: Computing the **Silhouette Score** might take a long time in case of
   large data sets.
  
"""

import os
import sys
import getopt

import numpy as np

from sklearn.metrics    import silhouette_score
from sklearn.metrics    import adjusted_rand_score



## cdata - data that was clustered
## cres  - results of clustering
## truel - the corresponding true labels
def EvaluateClustering(cResFile, trueLFile, dataFile, compShil):
    """Evaluate a clustering result and print the requested score(s).

    The Silhouette Score is computed from the clustered data and the assigned
    labels when ``compShil`` is true. The Adjusted Rand Score is computed from
    the true and the assigned labels whenever ``trueLFile`` is given. Both
    scores are printed to standard output; nothing is returned.

    Parameters
    ----------
    cResFile : str
        Path of the text file holding the clustering result. The first column
        is taken as the assigned cluster labels. ``''`` means "not given".
    trueLFile : str
        Path of the text file holding the true cluster labels (all values are
        flattened into one label vector). ``''`` means "not given".
    dataFile : str
        Path of the text file holding the data that was clustered. Only read
        when ``compShil`` is true. ``''`` means "not given".
    compShil : bool
        Whether to compute the Silhouette Score.

    Raises
    ------
    ValueError
        If a score is requested but a file it depends on was not given.
    """
    wantAri = trueLFile != ''

    # Fail early with a clear message instead of hitting an unbound local
    # variable further down when a required input is missing.
    if (compShil or wantAri) and cResFile == '':
        raise ValueError(
            "a cluster label file (cResFile) is required to compute a score")
    if compShil and dataFile == '':
        raise ValueError(
            "a data file (dataFile) is required to compute the silhouette score")

    # Load every needed input before starting any (possibly long) computation
    # so that an unreadable file is reported right away.
    cres = None
    if cResFile != '':
        # ndmin=2 keeps the first-column selection valid even when the file
        # contains a single column (or a single row) of values.
        cres = np.loadtxt(cResFile, ndmin=2)[:, 0].astype(int)
    truel = None
    if wantAri:
        truel = np.loadtxt(trueLFile).astype(int).flatten()
    cdata = None
    if compShil:
        # The data can be large: read it only when it is actually used, and
        # always as a 2-D (samples x features) array.
        cdata = np.loadtxt(dataFile, ndmin=2)

    ## compute the silhouette_score: requires the data and the labels from clustering
    if compShil:
        print(" ---- (Python) --- : Computing Silhouette-Score ... (can take long...)")
        silhouetteScore = silhouette_score(cdata, cres, metric='euclidean')
        print (" ===> The Silhouette-Score = {0:5.3f}".format(silhouetteScore))

    ## compute the adjusted rand index: requires true and clustering results labels
    if wantAri:
        print(" ---- (Python) --- : Computing Adjusted-Rand-Score ...")
        adjustedRandScore = adjusted_rand_score(truel, cres)
        print (" ===> The Adjusted Rand-Score = {0:5.3f}".format(adjustedRandScore))
################################################################################
def main(argv):
    """Parse the command-line options and run the clustering evaluation.

    Recognised options (getopt spec ``"hc:t:d:s"``):

    * ``-h``        print the usage line and exit
    * ``-c <file>`` clustering result (assigned labels); required
    * ``-t <file>`` true cluster labels; required unless ``-s`` is given
    * ``-d <file>`` clustered data; required when ``-s`` is given
    * ``-s``        compute the silhouette score

    Parameters
    ----------
    argv : list of str
        The command-line arguments without the program name.

    Raises
    ------
    SystemExit
        After printing the usage (``-h``), and with status 2 on an invalid
        option or a missing required argument.
    """
    usage = ('evaluateClusteringRes.py -c <cluster label file: result> '
             '[-t cluster label file: true] [-d data file] '
             '[-s compute silhouette score]')

    cResFile = ''
    trueLFile = ''    # optional
    dataFile = ''     # optional
    compShil = False  # requires data and cres

    try:
        opts, _ = getopt.getopt(argv, "hc:t:d:s")
    except getopt.GetoptError:
        print (usage)
        sys.exit(2)

    for opt, arg in opts:
        # Plain equality: `opt in ("-c")` would be a substring test on a str.
        if opt == '-h':
            print (usage)
            sys.exit()
        elif opt == '-c':
            cResFile = arg
        elif opt == '-t':
            trueLFile = arg
        elif opt == '-d':
            dataFile = arg
        elif opt == '-s':
            compShil = True

    print(" ==== (Python) === : Evaluating clustering result ...")

    # Missing-argument errors exit with a non-zero status (same code as the
    # invalid-option path) so that calling scripts can detect the failure.
    if cResFile == '':
        print(" -c <cluster label file: result> is a required argument")
        sys.exit(2)
    if compShil and dataFile == '':
        print(" [-d data file] (clustered data) is required for the silhouette score")
        sys.exit(2)
    if not compShil and trueLFile == '':
        print(" [-t cluster label file: true] is required for the adjusted rand score")
        sys.exit(2)

    EvaluateClustering(cResFile, trueLFile, dataFile, compShil)
################################################################################
if __name__ == "__main__":
    # Script entry point: forward everything after the program name to the
    # option parser.
    main(sys.argv[1:])