#!/usr/bin/python
"""Chart, SOM-map and PCA-project the wine data set stored in ``wine.data``.

The script reads comma separated rows (category first, then the measured
properties), and then:

* optionally saves one scatter chart per pair of properties (``<i>-<j>``),
* writes the min/max normalised rows to ``wine.data.norm``,
* optionally runs the external ``./som`` program and plots the winner
  coordinates it leaves in ``som.winners`` (figure ``som``),
* plots the eigenvalues of the covariance matrix (figure ``eigenvalues``) and
  the projection of the data on the two strongest eigenvectors (figure
  ``pca``).
"""

import string
import os

from matplotlib.matlab import plot, title, savefig, figure, close, set, gca, xlabel, ylabel
from MLab import cov, eig, array, innerproduct

# Human readable name of every property column, in file order.
names = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

makescharts = True   # save the pairwise scatter charts of the raw data
runsom = True        # run the external SOM program before reading its output

# Per-column statistics and per-row categories, filled in by extractvalues().
maxvalues = []
minvalues = []
sumvalues = []
categories = []

# Plot styles used for categories 1, 2 and 3 respectively.
_styles = ('ro', 'gs', 'b^')


def extractvalues(line):
    """Parse one comma separated data line and record its statistics.

    The first field is the integer category, which is appended to the global
    ``categories`` list; the remaining fields are converted to floats and
    folded into the global ``maxvalues``, ``minvalues`` and ``sumvalues``
    lists.  Because of these side effects every line must be passed to this
    function exactly once.

    Returns the list of float property values of the line.
    Raises ValueError when a field cannot be converted.
    """
    fields = line.strip().split(",")
    categories.append(int(fields[0]))
    values = [float(field) for field in fields[1:]]
    for i, v in enumerate(values):
        if i < len(maxvalues):
            maxvalues[i] = max(maxvalues[i], v)
            minvalues[i] = min(minvalues[i], v)
            sumvalues[i] = sumvalues[i] + v
        else:
            # First time this column is seen: the value seeds all statistics.
            maxvalues.append(v)
            minvalues.append(v)
            sumvalues.append(v)
    return values


def normalisedata(values):
    """Scale each value into [0, 1] using the recorded column min and max.

    A column whose minimum equals its maximum carries no information and is
    mapped to 0.0 instead of dividing by zero.
    """
    norm = []
    for i, v in enumerate(values):
        minv = minvalues[i]
        span = maxvalues[i] - minv
        if span == 0:
            norm.append(0.0)
        else:
            norm.append((v - minv) / span)
    return norm


def totext(values):
    """Format a row as space separated text terminated by a newline."""
    return " ".join([str(v) for v in values]) + "\n"


def _splitbycategory(rows):
    """Split rows into three lists for categories 1, 2 and 3.

    Rows are paired with the global ``categories`` list by position; a row
    with any other category is reported and left out.
    """
    groups = {1: [], 2: [], 3: []}
    for c, row in zip(categories, rows):
        if c in groups:
            groups[c].append(row)
        else:
            # c is an int, so it has to be converted before concatenation.
            print("unknown category: " + str(c))
    return groups[1], groups[2], groups[3]


def _plotgroups(groups, xcol=0, ycol=1):
    """Plot column ycol against column xcol for each category group."""
    for rows, style in zip(groups, _styles):
        plot([row[xcol] for row in rows], [row[ycol] for row in rows], style)


origfile = open("wine.data")
try:
    lines = origfile.readlines()
finally:
    origfile.close()

# A blank line (typically at the end of the file) holds no record.
lines = [line for line in lines if line.strip()]

values = [extractvalues(line) for line in lines]

# Split the un-normalised values into the three categories.  The rows parsed
# above are reused: running extractvalues() on the lines again would add every
# row to sumvalues a second time and duplicate the entries of categories.
values1, values2, values3 = _splitbycategory(values)

# plot charts of the raw data
if makescharts:
    ncolumns = len(names)
    for i in range(ncolumns):
        for j in range(i + 1, ncolumns):
            figure()
            _plotgroups((values1, values2, values3), i, j)
            title('Wine Properties')
            xlabel(names[i])
            ylabel(names[j])
            savefig(str(i) + "-" + str(j))
            close()

# normalise the data and write it to file for SOM to use
normvalues = [normalisedata(row) for row in values]
normalisedlines = [totext(row) for row in normvalues]
normalisedfile = open("wine.data.norm", "w")
try:
    normalisedfile.writelines(normalisedlines)
finally:
    normalisedfile.close()

if runsom:
    # NOTE(review): '178' and '13' presumably are the row and column counts of
    # the data set expected by ./som -- confirm before changing the input file.
    os.spawnlp(os.P_WAIT, './som', 'som', '178', '13', 'wine.data.norm', 'wine.som.config')

coordsfile = open("som.winners")
try:
    coordlines = coordsfile.readlines()
finally:
    coordsfile.close()

# Each line of som.winners holds the integer x and y coordinates for the data
# row at the same position.
somcoords = []
for coordline in coordlines:
    xy = coordline.split()
    if not xy:
        continue
    somcoords.append((int(xy[0]), int(xy[1])))

# use matplotlib functions to make the graph
figure()
_plotgroups(_splitbycategory(somcoords))
set(gca(), 'xticks', range(0, 12))
set(gca(), 'yticks', range(0, 12))
title("Wine SOM")
savefig("som")
close()

# PCA
# The rows being centred are the normalised ones, so the column means have to
# be normalised as well; normalisation is affine, hence normalising the raw
# mean gives the mean of the normalised column.
rawaverages = [total / len(values) for total in sumvalues]
averages = array(normalisedata(rawaverages))
cenvalues = array([array(row) - averages for row in normvalues])
covmatrix = cov(cenvalues)
eigvalues, eigvectors = eig(covmatrix)

eigvlist = list(eigvalues)
eigvlist.sort()
eigvlist.reverse()
figure()
plot(range(0, len(eigvlist)), eigvlist, 'k-')
set(gca(), 'xticks', range(0, len(eigvlist)))
title('Eigen Values')
savefig('eigenvalues')
close()

# find vectors with largest eigen value
# NOTE(review): eigvectors[ i ] is taken as the vector belonging to
# eigvalues[ i ], i.e. eig() is assumed to return eigenvectors as rows --
# confirm against the MLab version in use.
eigval1 = 0
eigval2 = 0
eigvec1 = []
eigvec2 = []
for i in range(len(eigvalues)):
    eigval, eigvec = eigvalues[i], eigvectors[i]
    if eigval > eigval1:
        # New best: the previous best becomes the runner-up.
        eigval2, eigvec2 = eigval1, eigvec1
        eigval1, eigvec1 = eigval, eigvec
    elif eigval > eigval2:
        eigval2, eigvec2 = eigval, eigvec

print('eigvalue1=' + str(eigval1))
print(eigvec1)
print('eigvalue2=' + str(eigval2))
print(eigvec2)

# Project every centred row on the two principal axes.  This walks the data
# rows themselves, so it does not depend on how many lines the SOM produced.
pcacoords = [(innerproduct(eigvec1, v), innerproduct(eigvec2, v)) for v in cenvalues]

figure()
_plotgroups(_splitbycategory(pcacoords))
title("Wine PCA")
savefig("pca")
close()