Source code for generateData

"""
.. _generateData_test3:

Generate data for clustering
=============================

Auxiliary fuctions for generating data, distributed in different shapes, for 
clustering. The geneated clusters are tipically hard to detect and separate 
(e.g. 4 intertwined spirals or concentric rings) for classical clustering 
algorithms. The provided functionalities are used by :ref:`Test3 <sec_test3>`.


Description
-----------

Data are generated in 2 dimensions, clustered around 4 centers with different 
`shapes`: 

 - ``4Clusters`` : distributed **normally around** the 4, random **point** centers
 - ``4Moons``    : distributed around 4 **moon-shaped** centers 
 - ``4Circles``  : distributed around 4 **concentric circles** as centers
 - ``4Spirals``  : distributed around 4 **intertwined spirals** as centers

These can be selected ny providing one of the above strings as input ragument.
The generated data will be **shuffled, standardised**. Sub-sets for `training` 
and `validation` will aslo be selected accoridng to the sizes given as input 
arguments. The generated `data set`, with the corresponding `labels` as well as 
the `training` and `validation` sets will be saved into files at the location 
specified by the corresponding input argument (see :ref:`Example <ex_generateData>`).


.. _ex_generateData:

Example
-------

The following example generates 100 000 data in 2 dimensions as 4 concentric 
rings. The data will be shuffled, standardised and saved to the 
:math:`\\texttt{output/data}\_\\texttt{4Circles.dat}` file together with the 
:math:`\\texttt{output/data}\_\\texttt{4Circles}\_\\texttt{Train}\_\\texttt{N20000.dat}` and 
:math:`\\texttt{output/data}\_\\texttt{4Circles}\_\\texttt{Valid}\_\\texttt{N20000.dat}` files containing the 
20 000 and 80 0000 sub-sampled data for training and validation ::

   GenData('4Circles', 'output', nSamples=100000, nTrain=20000, nValid=80000)
  
Plot of the generated data is not required. 

-----

"""


import numpy as np
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

from itertools import cycle, islice



## Generate a random n-class classification problem (with n=4)
[docs]def genKClusters (nSamples, nFeatures=2, nClusters=4, levSeparation=2, rndseed=0):
    """
    Generates data for clustering that are normally distributed around the centers.

    Args:     
        nSamples  (int): number of sample points to generate
        nFeatures (int): dimensions of the data points
        nClusters (int): number of clusters to generate
        levSeparation (float): level of cluster separation
        rndseed   (int): state of the random number generator

    Returns:
        (:obj:`numpy::array`, :obj:`numpy::array`) : tuple containing the generated data and their labels
    """
    return sklearn.datasets.make_classification(
              n_samples            = nSamples, 
              n_features           = nFeatures, 
              n_classes            = nClusters, 
              n_clusters_per_class = 1,
              class_sep            = levSeparation,
              flip_y               = 0,
              n_informative        = nFeatures, 
              n_redundant          = 0,
              n_repeated           = 0,
              random_state         = rndseed)

## Generate 4 moons 
[docs]def genKMoons (nSamples, levNoise=0.1, rndseed=0):
    """
    Generates data for clustering: 4, `moon-shape` clusters in 2 dimension.

    Args:     
        nSamples  (int)  : number of sample points to generate
        levNoise (float) : determines the `thickness` of the `moons`
        rndseed   (int)  : state of the random number generator

    Returns:
        (:obj:`numpy::array`, :obj:`numpy::array`) : tuple containing the generated data and their labels
    """    
    nHalf = int(nSamples*0.5)
    xData, yData = sklearn.datasets.make_moons (
      n_samples            = nHalf,
      noise                = levNoise, 
      random_state         = rndseed)
    xData[np.where(yData[:]==0),1] *= 2.5
    xData[np.where(yData[:]==1),1] *= 2.5
    #
    d1 = sklearn.datasets.make_moons (
      n_samples            = nSamples-nHalf, 
      noise                = levNoise,
      random_state         = rndseed+12345678)
    # rotate, translate scale 
    tt = 90.0/180.0*3.1415  
    rot = np.array([[np.cos(tt),-np.sin(tt)],[np.sin(tt),np.cos(tt)]])  
    d2 = np.matmul(d1[0],rot)
    d2[np.where(d1[1]==0),0] += 2.0   # push blue to +x
    d2[np.where(d1[1]==0),1] += 0.25  # 
    d2[np.where(d1[1]==0),1] *= 3 
    d2[np.where(d1[1]==1),1] += 1.25  
    d2[np.where(d1[1]==1),0] -= 1.5   # push red to -x
    d2[np.where(d1[1]==1),1] *= 3  
    # concatenate
    xData = np.concatenate((xData,d2))
    yData = np.concatenate((yData, d1[1]+2))
    return xData, yData

[docs]def genKCircles(nSamples, levNoise=0.075, rndseed=0):
    """
    Generates data for clustering: 4, concentric rings in 2 dimension.

    Args:     
        nSamples  (int)  : number of sample points to generate
        levNoise (float) : determines the `thickness` of the rings
        rndseed   (int)  : state of the random number generator

    Returns:
        (:obj:`numpy::array`, :obj:`numpy::array`) : tuple containing the generated data and their labels
    """        
    nHalf = int(nSamples*0.5)
    xData, yData = sklearn.datasets.make_circles(
      n_samples    = nHalf, 
      factor       = 0.5,
      noise        = levNoise,
      random_state = rndseed)     
    xData[np.where(yData[:]==0)] *= 6
    xData[np.where(yData[:]==1)] *= 4 
    xd, yd = sklearn.datasets.make_circles(
      n_samples    = nSamples-nHalf, 
      factor       = 0.2,
      noise        = levNoise*0.25,
      random_state = rndseed+12345678)
    xd[np.where(yd[:]==0)] *= 8
    xd[np.where(yd[:]==1)] *= 2
    #
    xData = np.concatenate((xData, xd))
    yData = np.concatenate((yData, yd+2))
    return xData, yData


[docs]def genK4Spirals(nSamples, levNoise=0.1, rndseed=0):
    """
    Generates data for clustering: 4, intertwined spirals in 2 dimension.

    Args:     
        nSamples  (int)  : number of sample points to generate
        levNoise (float) : determines the `thickness` of the spirals
        rndseed   (int)  : state of the random number generator

    Returns:
        (:obj:`numpy::array`, :obj:`numpy::array`) : tuple containing the generated data and their labels
    """    
    clusterSize = (int)(nSamples/4);
    cl    = np.array([0,1,2,3]) 
    xData = np.zeros((nSamples,2))
    yData = np.zeros(nSamples, dtype=np.int32)
    ## r
    r     = np.linspace(0.05, 1.0, clusterSize)
    cc    = 3.1415/2
    for c in cl:
        # theta
        t = np.linspace(c*cc, (c+5.0)*cc, clusterSize)
        np.add(t, np.random.normal(0, levNoise, clusterSize), out=t)
        #
        st = c*clusterSize
        ed = (c+1)*clusterSize
        rr = np.arange(st,ed)
        xData[rr,0] = r*np.sin(t)
        xData[rr,1] = r*np.cos(t)
        yData[rr]   = c
    return xData, yData



[docs]def GenData(name, outpath, nSamples=100000, nTrain=0, nValid=0, doPlot=False):
    """
    Generates any of the 4 (blobs, moons, rings, spirals) data sets for clustering.

    Generates 2D data with the required size in 4 clusters according to the 
    required shape of clusters. The data are shuffled and standardised. Data
    sets for training and validation, with the required sizes, are sub-
    sampled. The corresponding files are saved under the required location.

    Args:
        name      (str)  : one of {'4Clusters','4Moons','4Circles','4Spirals'}  
        outpath   (str)  : location where the geneated data files will be saved
        nSamples  (int)  : number of sample points to generate
        nTrain    (int)  : number of sample points for training
        nValid    (int)  : number of sample points for validation
        doPlot   (bool)  : flag to indicate if data should be plotted (visualised)

    Yields:        
        Files, saved under the specified location, containing
          
          - :math:`\\texttt{data}\_\\texttt{x.dat}`            : the complete data set (data points as rows)
          - :math:`\\texttt{data}\_\\texttt{x}\_\\texttt{Labels.dat}`    : the corresponding cluster labels (for each row)
          - :math:`\\texttt{data}\_\\texttt{x}\_\\texttt{Train}\_\\texttt{Ny.dat}` : the `y` sub-sampled data for training
          - :math:`\\texttt{data}\_\\texttt{x}\_\\texttt{Valid}\_\\texttt{Nz.dat}` : the `z` sub-sampled data for validation
        
        where `x` is one of the available data set names.  
        The generated data are also plotted in case it was required. 
    """        
    if nTrain+nValid > nSamples:
        print("****  Error: nTrain + nValid = ", nTrain, " + ", nValid, " = ", nTrain+nValid, " > nSample = ", nSample)
        return 
    if   name == '4Clusters':
         xData, yData = genKClusters(nSamples)   
    elif name == '4Moons':
         xData, yData = genKMoons(nSamples)
    elif name == '4Circles':
         xData, yData = genKCircles(nSamples)
    elif name == '4Spirals':
         xData, yData  = genK4Spirals(nSamples)
    else :
        print("**** Error: unknown data set name ' ", name, " ' in generateData::GenData")
        return
    ## shuffle to get random examples (store the state and set back)
    np.random.seed(12345678)
    st0 = np.random.get_state()
    np.random.shuffle(xData)
    np.random.set_state(st0)
    np.random.shuffle(yData)
    ## sandardise
    xData = StandardScaler().fit_transform(xData)
    ## ouput name 
    fNameDat      = ''.join([outpath,'/data_',name,'.dat'])
    fNameLab      = ''.join([outpath,'/data_',name,'_Labels.dat'])
    fNameDatTrain = ''.join([outpath,'/data_',name,'_Train_N',str(nTrain),'.dat'])
    fNameDatValid = ''.join([outpath,'/data_',name,'_Valid_N',str(nValid),'.dat'])
    np.savetxt(fNameDat, xData, fmt='%14.4e')
    np.savetxt(fNameLab, yData, fmt='%d')
    if nTrain > 0:
        np.savetxt(fNameDatTrain, xData[0:nTrain,]     , fmt='%14.4e')
    if nValid > 0:    
        np.savetxt(fNameDatValid, xData[nTrain:nTrain+nValid,], fmt='%14.4e')
    ##
    ## if plotting was required
    if doPlot:
        nClusters = 4
        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                             '#f781bf', '#a65628', '#984ea3',
                                             '#999999', '#e41a1c', '#dede00']),
                                              int(nClusters + 1))))
#        print (colors)                                      
        # add black color for outliers (if any)
        colors = np.append(colors, ["#000000"])
        plt.scatter(xData[:, 0], xData[:, 1], s=1, color=colors[yData])
        plt.show()


#GenData('4Circles', 'output', nSamples=100000, nTrain=20000, nValid=80000)