"""Radial-basis-function (RBF) network with HDF5-cached intermediate results.

The module provides a handful of radial kernels, the ``Rbf`` model class
(training, prediction, summary) and a stand-alone ``predict`` function that
evaluates a previously stored model.
"""
from numpy import array, append, vstack, transpose, reshape, \
    dot, true_divide, mean, exp, sqrt, log, \
    loadtxt, savetxt, zeros, frombuffer
from numpy.linalg import norm, lstsq
from multiprocessing import Process, Array
from random import sample
from time import time
from sys import stdout
from ctypes import c_double

try:
    from h5py import File
except ImportError:
    # h5py is only needed by the HDF5-backed code paths (Rbf(from_files=...),
    # Rbf.train and the module-level predict).  Keep the kernels importable
    # without it and fail with the same exception type on first use.
    def File(*args, **kwargs):
        raise ImportError('h5py is required for HDF5 persistence')


def metrics(a, b):
    """Return the Euclidean distance between ``a`` and ``b``."""
    return norm(a - b)


def gaussian(x, mu, sigma):
    """Gaussian kernel: ``exp(-r**2 / (2 * sigma**2))`` with ``r = |x - mu|``."""
    return exp(- metrics(mu, x)**2 / (2 * sigma**2))


def multiQuadric(x, mu, sigma):
    """Multiquadric kernel: ``sqrt(r**2 + sigma**2)`` with ``r = |x - mu|``."""
    return pow(metrics(mu, x)**2 + sigma**2, 0.5)


def invMultiQuadric(x, mu, sigma):
    """Inverse multiquadric kernel: ``1 / sqrt(r**2 + sigma**2)``."""
    return pow(metrics(mu, x)**2 + sigma**2, -0.5)


def plateSpine(x, mu):
    """Thin-plate kernel: ``r**2 * log(r)`` with ``r = |x - mu|``.

    At ``r == 0`` the limit value 0 is returned; evaluating the formula
    directly there would give ``0 * log(0)``, i.e. NaN.
    """
    r = metrics(mu, x)
    if r == 0:
        return 0.0
    return (r**2) * log(r)


class Rbf:
    """RBF network whose intermediate results are cached in HDF5 files.

    Attributes set by the constructor: ``prefix`` (file-name prefix of the
    caches written by ``train``), ``workers`` (number of processes used to
    build the design matrix) and ``extra_neurons`` (number of training samples
    added as centres on top of the pre-computed cluster centres).
    """

    def __init__(self, prefix = 'rbf', workers = 4, extra_neurons = 0, from_files = None):
        """Create a model, optionally loading stored parameters.

        ``from_files``, when given, is a mapping with the keys ``'w'``,
        ``'mu'`` and ``'sigma'`` naming HDF5 files that hold the datasets
        ``'w'``, ``'mu'`` and ``'sigmas'`` respectively.  The files are opened
        read-only and stay open; the datasets are read lazily by ``predict``.
        """
        self.prefix = prefix
        self.workers = workers
        self.extra_neurons = extra_neurons

        # Import partial model
        if from_files is not None:
            w_handle = self.w_handle = File(from_files['w'], 'r')
            mu_handle = self.mu_handle = File(from_files['mu'], 'r')
            sigma_handle = self.sigma_handle = File(from_files['sigma'], 'r')

            self.w = w_handle['w']
            self.mu = mu_handle['mu']
            self.sigmas = sigma_handle['sigmas']

            self.neurons = self.sigmas.shape[0]

    def _calculate_error(self, y):
        """Store the mean absolute error of ``self.os`` against ``y``.

        Sets ``self.error`` and ``self.relative_error`` (the error divided by
        the mean of ``y``).
        """
        self.error = mean(abs(self.os - y))
        self.relative_error = true_divide(self.error, mean(y))

    def _generate_mu(self, x):
        """Return the centres: stored clusters plus random rows of ``x``.

        The cluster centres are read from ``clusters100.txt`` (tab separated)
        and ``self.extra_neurons`` distinct rows of ``x`` are appended.
        """
        # TODO: Make reusable
        mu_clusters = loadtxt('clusters100.txt', delimiter='\t')

        mu_indices = sample(range(self.n), self.extra_neurons)
        mu_new = x[mu_indices, :]
        return vstack((mu_clusters, mu_new))

    def _calculate_sigmas(self):
        """Return one kernel width per centre.

        Each width is twice the mean of the distances from that centre to all
        centres; the zero self-distance is part of that mean.
        """
        neurons = self.neurons
        mu = self.mu

        sigmas = zeros((neurons, ))
        for i in range(neurons):
            dists = [0 if i == j else metrics(mu[i], mu[j])
                     for j in range(neurons)]
            sigmas[i] = mean(dists) * 2
        return sigmas

    @staticmethod
    def _fill_phi_rows(worker_id, shared_array, shape, x, mu, sigmas, start, stop):
        """Fill rows ``start`` .. ``stop - 1`` of the design matrix.

        ``shared_array`` is a writable buffer of ``shape[0] * shape[1]``
        doubles that is viewed as a matrix of the given ``shape``.  This is a
        static method rather than a closure so that it can be used as a
        process target under every multiprocessing start method.
        """
        phi = frombuffer(shared_array).reshape(shape)
        span = stop - start
        for k, i in enumerate(range(start, stop)):
            for j in range(shape[1]):
                # The other kernels defined in this module (gaussian,
                # invMultiQuadric, plateSpine) can be swapped in here.
                phi[i, j] = multiQuadric(x[i, :], mu[j], sigmas[j])
            if k % 1000 == 0:
                percent = true_divide(k, span) * 100
                print('{} : {:2.2f}%'.format(worker_id, percent))
        print('{} : Done'.format(worker_id))

    def _calculate_phi(self, x):
        """Return the design matrix of ``x``, one row per sample.

        The rows are split into ``self.workers`` contiguous ranges (the last
        range also takes the remainder) and each range is computed by its own
        process writing into a shared buffer.

        Raises ``RuntimeError`` when a worker process exits with a non-zero
        code, because its rows would otherwise silently stay zero.
        """
        worker_count = self.workers
        neurons = self.neurons
        mu = self.mu
        sigmas = self.sigmas
        # Release the reference to any previously computed matrix.
        self.phi = None
        # Size the matrix from the data actually passed in: self.n is the
        # training-set size and is not even set for a model loaded from files.
        n = x.shape[0]
        shape = (n, neurons)

        if n == 0 or neurons == 0:
            return zeros(shape)

        # No lock is needed: every worker writes a disjoint range of rows.
        shared_array = Array(c_double, n * neurons, lock=False)

        rows_per_worker = n // worker_count
        remainder = n % worker_count
        processes = []
        for c in range(worker_count):
            start = c * rows_per_worker
            stop = start + rows_per_worker
            if c == worker_count - 1:
                stop += remainder
            process = Process(
                target=Rbf._fill_phi_rows,
                args=(c, shared_array, shape, x, mu, sigmas, start, stop))
            processes.append(process)
            process.start()

        for worker_id, process in enumerate(processes):
            process.join()
            if process.exitcode != 0:
                raise RuntimeError('phi worker {} exited with code {}'.format(
                    worker_id, process.exitcode))

        return frombuffer(shared_array).reshape(shape)

    def _do_algebra(self, y):
        """Solve the least-squares problem ``phi * w = y``.

        Returns the pair ``(w, os)`` where ``os`` is the model output for the
        training data, ``dot(w, transpose(phi))``.
        """
        phi = self.phi

        w = lstsq(phi, y)[0]
        os = dot(w, transpose(phi))
        return w, os

    def train(self, x, y):
        """Fit the model to the samples ``x`` and targets ``y``.

        Centres, widths, design matrix, weights and outputs are each written
        to an HDF5 file named ``<prefix>-<what>-<n>-<extra_neurons>.hdf5``.
        Afterwards ``self.error`` and ``self.relative_error`` hold the
        training error.
        """
        self.n = x.shape[0]

        ## Initialize HDF5 caches
        prefix = self.prefix
        postfix = str(self.n) + '-' + str(self.extra_neurons) + '.hdf5'
        name_template = prefix + '-{}-' + postfix
        phi_handle = self.phi_handle = File(name_template.format('phi'), 'w')
        os_handle = self.os_handle = File(name_template.format('os'), 'w')
        w_handle = self.w_handle = File(name_template.format('w'), 'w')
        mu_handle = self.mu_handle = File(name_template.format('mu'), 'w')
        sigma_handle = self.sigma_handle = File(name_template.format('sigma'), 'w')

        ## Mu generation
        mu = self.mu = self._generate_mu(x)
        self.neurons = mu.shape[0]
        print('({} neurons)'.format(self.neurons))
        # Save to HDF5
        mu_handle.create_dataset('mu', data = mu)

        ## Sigma calculation
        print('Calculating Sigma...')
        sigmas = self.sigmas = self._calculate_sigmas()
        # Save to HDF5
        sigma_handle.create_dataset('sigmas', data = sigmas)
        print('Done')

        ## Phi calculation
        print('Calculating Phi...')
        phi = self.phi = self._calculate_phi(x)
        print('Done')
        # Saving to HDF5
        print('Serializing...')
        phi_h5 = phi_handle.create_dataset('phi', data = phi)
        # Keep only the file-backed dataset, not the in-memory matrix.
        del phi
        self.phi = phi_h5
        print('Done')

        ## Algebra
        print('Doing final algebra...')
        w, os = self._do_algebra(y)
        self.w = w
        # _calculate_error reads self.os, so it has to be stored here.
        self.os = os
        # Saving to HDF5
        w_handle.create_dataset('w', data = w)
        os_handle.create_dataset('os', data = os)

        ## Calculate error
        self._calculate_error(y)

        # Push the caches to disk; the handles stay open on the instance.
        for handle in (phi_handle, os_handle, w_handle, mu_handle, sigma_handle):
            handle.flush()
        print('Done')

    def predict(self, test_data):
        """Return the model output for ``test_data``.

        The result is also written to ``iok3834.txt``, one value per line.
        """
        # dataset[()] reads a whole HDF5 dataset into memory and is harmless
        # on arrays that are already in memory (after train or a prior call).
        self.mu = self.mu[()]
        self.sigmas = self.sigmas[()]
        w = self.w = self.w[()]

        print('Calculating phi for test data...')
        phi = self._calculate_phi(test_data)
        os = dot(w, transpose(phi))
        savetxt('iok3834.txt', os, delimiter='\n')
        return os

    @property
    def summary(self):
        """Human-readable report of the model size and training error."""
        return '\n'.join(
            ['-----------------',
             'Training set size: {}'.format(self.n),
             'Hidden layer size: {}'.format(self.neurons),
             '-----------------',
             'Absolute error : {:02.2f}'.format(self.error),
             'Relative error : {:02.2f}%'.format(self.relative_error * 100)])


def predict(test_data):
    """Evaluate the stored ``rbf-*-212243-2400.hdf5`` model on ``test_data``.

    Single-process counterpart of ``Rbf.predict``.  The result is also written
    to ``iok3834.txt``, one value per line.
    """
    with File('rbf-mu-212243-2400.hdf5', 'r') as mu_file:
        mu = mu_file['mu'][()]
    with File('rbf-sigma-212243-2400.hdf5', 'r') as sigma_file:
        sigmas = sigma_file['sigmas'][()]
    with File('rbf-w-212243-2400.hdf5', 'r') as w_file:
        w = w_file['w'][()]

    n = test_data.shape[0]
    neur = mu.shape[0]

    # mu is used as stored, one centre per row, exactly as Rbf.predict does.
    phi = zeros((n, neur))
    for i in range(n):
        for j in range(neur):
            phi[i, j] = multiQuadric(test_data[i, :], mu[j], sigmas[j])

    os = dot(w, transpose(phi))
    savetxt('iok3834.txt', os, delimiter='\n')
    return os