"""
.. _generateData_test3:
Generate data for clustering
=============================
Auxiliary functions for generating data, distributed in different shapes, for
clustering. The generated clusters are typically hard to detect and separate
(e.g. 4 intertwined spirals or concentric rings) for classical clustering
algorithms. The provided functionalities are used by :ref:`Test3 <sec_test3>`.
Description
-----------
Data are generated in 2 dimensions, clustered around 4 centers with different
`shapes`:
- ``4Clusters`` : distributed **normally around** the 4, random **point** centers
- ``4Moons`` : distributed around 4 **moon-shaped** centers
- ``4Circles`` : distributed around 4 **concentric circles** as centers
- ``4Spirals`` : distributed around 4 **intertwined spirals** as centers
These can be selected by providing one of the above strings as input argument.
The generated data will be **shuffled, standardised**. Sub-sets for `training`
and `validation` will also be selected according to the sizes given as input
arguments. The generated `data set`, with the corresponding `labels` as well as
the `training` and `validation` sets will be saved into files at the location
specified by the corresponding input argument (see :ref:`Example <ex_generateData>`).
.. _ex_generateData:
Example
-------
The following example generates 100 000 data in 2 dimensions as 4 concentric
rings. The data will be shuffled, standardised and saved to the
:math:`\\texttt{output/data}\_\\texttt{4Circles.dat}` file together with the
:math:`\\texttt{output/data}\_\\texttt{4Circles}\_\\texttt{Train}\_\\texttt{N20000.dat}` and
:math:`\\texttt{output/data}\_\\texttt{4Circles}\_\\texttt{Valid}\_\\texttt{N80000.dat}` files containing the
20 000 and 80 000 sub-sampled data for training and validation ::
GenData('4Circles', 'output', nSamples=100000, nTrain=20000, nValid=80000)
Plot of the generated data is not required.
-----
"""
import numpy as np
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from itertools import cycle, islice
## Generate a random n-class classification problem (with n=4)
def genKClusters(nSamples, nFeatures=2, nClusters=4, levSeparation=2, rndseed=0):
    """
    Generates data for clustering that are normally distributed around the centers.

    Thin wrapper around ``sklearn.datasets.make_classification``: every
    feature is informative (no redundant or repeated features), each class
    consists of a single cluster and no label is flipped.

    Args:
        nSamples (int): number of sample points to generate
        nFeatures (int): dimensions of the data points
        nClusters (int): number of clusters to generate
        levSeparation (float): level of cluster separation
        rndseed (int): state of the random number generator

    Returns:
        (:obj:`numpy::array`, :obj:`numpy::array`) : tuple containing the generated data and their labels
    """
    options = {
        # size and dimensionality of the sample
        'n_samples': nSamples,
        'n_features': nFeatures,
        # one cluster per class, i.e. classes and clusters coincide
        'n_classes': nClusters,
        'n_clusters_per_class': 1,
        'class_sep': levSeparation,
        # keep the labels clean and every dimension informative
        'flip_y': 0,
        'n_informative': nFeatures,
        'n_redundant': 0,
        'n_repeated': 0,
        'random_state': rndseed,
    }
    return sklearn.datasets.make_classification(**options)
## Generate 4 moons
def genKMoons(nSamples, levNoise=0.1, rndseed=0):
    """
    Generates data for clustering: 4, `moon-shape` clusters in 2 dimension.

    The sample is assembled from two independent ``make_moons`` draws. The
    first pair of moons keeps the labels 0 and 1 and is stretched along the
    second coordinate. The second pair is rotated by (approximately) a
    quarter turn, its two moons are pushed apart along the first coordinate,
    shifted and stretched along the second one, and relabelled to 2 and 3.

    Args:
        nSamples (int) : number of sample points to generate
        levNoise (float) : determines the `thickness` of the `moons`
        rndseed (int) : state of the random number generator

    Returns:
        (:obj:`numpy::array`, :obj:`numpy::array`) : tuple containing the generated data and their labels
    """
    firstCount = int(0.5 * nSamples)
    secondCount = nSamples - firstCount

    # first pair of moons (labels 0 and 1): stretch the second coordinate
    xFirst, yFirst = sklearn.datasets.make_moons(
        n_samples=firstCount,
        noise=levNoise,
        random_state=rndseed)
    for label in (0, 1):
        xFirst[yFirst == label, 1] *= 2.5

    # second pair of moons, drawn with a different seed
    xRaw, yRaw = sklearn.datasets.make_moons(
        n_samples=secondCount,
        noise=levNoise,
        random_state=rndseed + 12345678)

    # rotate ...
    angle = 0.5 * 3.1415
    cosA, sinA = np.cos(angle), np.sin(angle)
    rotation = np.array([[cosA, -sinA],
                         [sinA, cosA]])
    xSecond = np.matmul(xRaw, rotation)

    # ... then translate and scale each of the two moons separately
    blue = yRaw == 0
    red = yRaw == 1
    xSecond[blue, 0] += 2.0    # push blue to +x
    xSecond[blue, 1] = (xSecond[blue, 1] + 0.25) * 3
    xSecond[red, 0] -= 1.5     # push red to -x
    xSecond[red, 1] = (xSecond[red, 1] + 1.25) * 3

    # stack both pairs; the second pair gets the labels 2 and 3
    points = np.concatenate((xFirst, xSecond))
    labels = np.concatenate((yFirst, yRaw + 2))
    return points, labels
def genKCircles(nSamples, levNoise=0.075, rndseed=0):
    """
    Generates data for clustering: 4, concentric rings in 2 dimension.

    Two independent ``make_circles`` draws (an outer and an inner circle
    each) are rescaled per label so that the four rings do not coincide;
    the rings of the second draw are generated with a quarter of the noise
    level and are relabelled to 2 and 3.

    Args:
        nSamples (int) : number of sample points to generate
        levNoise (float) : determines the `thickness` of the rings
        rndseed (int) : state of the random number generator

    Returns:
        (:obj:`numpy::array`, :obj:`numpy::array`) : tuple containing the generated data and their labels
    """
    firstCount = int(0.5 * nSamples)
    secondCount = nSamples - firstCount

    # first pair of rings: labels 0 and 1
    xFirst, yFirst = sklearn.datasets.make_circles(
        n_samples=firstCount,
        factor=0.5,
        noise=levNoise,
        random_state=rndseed)
    xFirst[yFirst == 0] *= 6
    xFirst[yFirst == 1] *= 4

    # second pair of rings: thinner, different seed, labels become 2 and 3
    xSecond, ySecond = sklearn.datasets.make_circles(
        n_samples=secondCount,
        factor=0.2,
        noise=levNoise * 0.25,
        random_state=rndseed + 12345678)
    xSecond[ySecond == 0] *= 8
    xSecond[ySecond == 1] *= 2

    points = np.concatenate((xFirst, xSecond))
    labels = np.concatenate((yFirst, ySecond + 2))
    return points, labels
def genK4Spirals(nSamples, levNoise=0.1, rndseed=0):
    """
    Generates data for clustering: 4, intertwined spirals in 2 dimension.

    For spiral ``c`` (0..3) the radius grows linearly from 0.05 to 1.0 while
    the angle sweeps five quarter turns, starting at ``c`` quarter turns.
    Normally distributed noise with standard deviation ``levNoise`` is added
    to the angle. Points of the same spiral occupy consecutive rows.

    The noise is drawn from a private generator seeded with ``rndseed``, so
    the result is reproducible and the global NumPy random state is left
    untouched. When ``nSamples`` is not a multiple of 4, the remaining points
    are assigned to the first spirals (one extra point each), so that every
    returned row is a real spiral point.

    Args:
        nSamples (int) : number of sample points to generate
        levNoise (float) : determines the `thickness` of the spirals
        rndseed (int) : state of the random number generator

    Returns:
        (:obj:`numpy::array`, :obj:`numpy::array`) : tuple containing the generated data and their labels
    """
    nSpirals = 4
    quarterTurn = 0.5 * np.pi
    # private generator: honours ``rndseed`` without touching the global state
    rng = np.random.RandomState(rndseed)
    # spread the samples as evenly as possible over the spirals
    baseSize, nExtra = divmod(nSamples, nSpirals)

    xData = np.zeros((nSamples, 2))
    yData = np.zeros(nSamples, dtype=np.int32)
    start = 0
    for c in range(nSpirals):
        size = baseSize + (1 if c < nExtra else 0)
        stop = start + size
        # radius grows linearly along the spiral
        r = np.linspace(0.05, 1.0, size)
        # angle: 5 quarter turns, each spiral starts a quarter turn later
        t = np.linspace(c * quarterTurn, (c + 5.0) * quarterTurn, size)
        t += rng.normal(0, levNoise, size)
        #
        xData[start:stop, 0] = r * np.sin(t)
        xData[start:stop, 1] = r * np.cos(t)
        yData[start:stop] = c
        start = stop
    return xData, yData
def GenData(name, outpath, nSamples=100000, nTrain=0, nValid=0, doPlot=False):
    """
    Generates any of the 4 (blobs, moons, rings, spirals) data sets for clustering.

    Generates 2D data with the required size in 4 clusters according to the
    required shape of clusters. The data are shuffled and standardised. Data
    sets for training and validation, with the required sizes, are sub-
    sampled: the training set is taken from the first ``nTrain`` rows of the
    shuffled data and the validation set from the ``nValid`` rows that
    follow. The corresponding files are saved under the required location.

    Args:
        name (str) : one of {'4Clusters','4Moons','4Circles','4Spirals'}
        outpath (str) : location where the generated data files will be saved
        nSamples (int) : number of sample points to generate
        nTrain (int) : number of sample points for training
        nValid (int) : number of sample points for validation
        doPlot (bool) : flag to indicate if data should be plotted (visualised)

    Returns:
        None. An error message is printed and nothing is generated when
        ``nTrain + nValid`` exceeds ``nSamples`` or when ``name`` is unknown.
        Otherwise the following files are written under ``outpath``:

        - ``data_x.dat`` : the complete data set (data points as rows)
        - ``data_x_Labels.dat`` : the corresponding cluster labels (for each row)
        - ``data_x_Train_Ny.dat`` : the `y` sub-sampled data for training (only if ``nTrain > 0``)
        - ``data_x_Valid_Nz.dat`` : the `z` sub-sampled data for validation (only if ``nValid > 0``)

        where `x` is one of the available data set names.
        The generated data are also plotted in case it was required.
    """
    if nTrain + nValid > nSamples:
        # report the requested sizes (the last value is the ``nSamples`` argument)
        print("**** Error: nTrain + nValid = ", nTrain, " + ", nValid, " = ", nTrain+nValid, " > nSample = ", nSamples)
        return

    if name == '4Clusters':
        xData, yData = genKClusters(nSamples)
    elif name == '4Moons':
        xData, yData = genKMoons(nSamples)
    elif name == '4Circles':
        xData, yData = genKCircles(nSamples)
    elif name == '4Spirals':
        xData, yData = genK4Spirals(nSamples)
    else:
        print("**** Error: unknown data set name ' ", name, " ' in generateData::GenData")
        return

    ## shuffle to get random examples: the random state is stored and set
    ## back in between, so the data rows and their labels are shuffled
    ## starting from the same generator state
    np.random.seed(12345678)
    st0 = np.random.get_state()
    np.random.shuffle(xData)
    np.random.set_state(st0)
    np.random.shuffle(yData)

    ## standardise
    xData = StandardScaler().fit_transform(xData)

    ## output names
    fNameDat = ''.join([outpath, '/data_', name, '.dat'])
    fNameLab = ''.join([outpath, '/data_', name, '_Labels.dat'])
    fNameDatTrain = ''.join([outpath, '/data_', name, '_Train_N', str(nTrain), '.dat'])
    fNameDatValid = ''.join([outpath, '/data_', name, '_Valid_N', str(nValid), '.dat'])

    np.savetxt(fNameDat, xData, fmt='%14.4e')
    np.savetxt(fNameLab, yData, fmt='%d')
    # training and validation sets are consecutive, non-overlapping slices
    if nTrain > 0:
        np.savetxt(fNameDatTrain, xData[0:nTrain, ], fmt='%14.4e')
    if nValid > 0:
        np.savetxt(fNameDatValid, xData[nTrain:nTrain+nValid, ], fmt='%14.4e')

    ## if plotting was required
    if doPlot:
        nClusters = 4
        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                             '#f781bf', '#a65628', '#984ea3',
                                             '#999999', '#e41a1c', '#dede00']),
                                      int(nClusters + 1))))
        # add black color for outliers (if any)
        colors = np.append(colors, ["#000000"])
        plt.scatter(xData[:, 0], xData[:, 1], s=1, color=colors[yData])
        plt.show()
#GenData('4Circles', 'output', nSamples=100000, nTrain=20000, nValid=80000)