# -*- coding: utf-8 -*-
###############################################################################
# KB_CAT KNOWLEDGE DISCOVERY IN DATA MINING (CATALOG PROGRAM) #
# by ROBERTO BELLO (COPYRIGHT MARCH 2011 ALL RIGHTS RESERVED) #
# Language used: PYTHON . #
###############################################################################
import os
import random
import copy
import datetime
def mean(x): # arithmetic mean
    """Return the arithmetic mean of the numeric sequence *x*.

    Uses explicit float division so integer inputs are not truncated
    (under Python 2 the original ``sum(x) / n`` performed integer
    division when every element was an int).
    Raises ZeroDivisionError for an empty sequence, as before.
    """
    return sum(x) / float(len(x))
def sd(x): # standard deviation (population)
    """Return the population standard deviation of *x* (divides by n).

    Fixes the original's loop variable shadowing the parameter
    (``for x in x``) and uses float division to avoid Python 2
    integer truncation. The divisor stays ``n`` (population form),
    matching the original behavior.
    """
    n = float(len(x))
    m = sum(x) / n
    return (sum((v - m) ** 2 for v in x) / n) ** 0.5
def is_number(s):
    """Return True when *s* parses as a float, False otherwise."""
    try:
        float(s)
    except ValueError:
        return False
    return True
class ndim: # from 3D array to flat array
    """Map 3-D cell coordinates onto indices of a flat list.

    x, y, z are the grid dimensions; d is how many of those dimensions
    are consumed when computing a cell index (3 for a full 3-D grid).
    Floor division (``//``) keeps every intermediate value an integer:
    behavior-identical on Python 2 (where ``/`` on ints already
    floored) and correct on Python 3 (where ``/`` would produce
    floats, yielding float indices). ``range`` replaces the
    Python-2-only ``xrange``.
    """

    def __init__(self, x, y, z, d):
        self.dimensions = [x, y, z]    # grid extents
        self.numdimensions = d         # dimensions used by getcellindex
        self.gridsize = x * y * z      # total number of cells

    def getcellindex(self, location):
        """Return the flat index for *location* = (ix, iy, iz)."""
        cindex = 0
        cdrop = self.gridsize
        for index in range(self.numdimensions):
            cdrop //= self.dimensions[index]   # stride of this axis
            cindex += cdrop * location[index]
        return cindex

    def getlocation(self, cellindex):
        """Inverse of getcellindex: return [ix, iy, iz] for a flat index."""
        res = []
        for size in reversed(self.dimensions):
            res.append(cellindex % size)
            cellindex //= size
        return res[::-1]
""" how to use ndim class
n=ndim(4,4,5,3)
print n.getcellindex((0,0,0))
print n.getcellindex((0,0,1))
print n.getcellindex((0,1,0))
print n.getcellindex((1,0,0))
print n.getlocation(20)
print n.getlocation(5)
print n.getlocation(1)
print n.getlocation(0)
"""
print("###############################################################################")
print("# KB_CAT KNOWLEDGE DISCOVERY IN DATA MINING (CATALOG PROGRAM) #")
print("# by ROBERTO BELLO (COPYRIGHT MARCH 2011 ALL RIGHTS RESERVED) #")
print("# Language used: PYTHON #")
print("###############################################################################")
# input and run parameters
error = 0
while True:
arch_input = raw_input('InputFile : ')
if not os.path.isfile(arch_input):
print("Oops! File does not exist. Try again... or CTR/C to exit")
else:
break
while True:
try:
num_gruppi = int(raw_input('Number of Groups (3 - 20) : '))
except ValueError:
print("Oops! That was no valid number. Try again...")
else:
if(num_gruppi < 3):
print("Oops! Number of Groups too low. Try again...")
else:
if(num_gruppi > 20):
print("Oops! Number of Groups too big. Try again...")
else:
break
while True:
normaliz = raw_input('Normalization(Max, Std, None) : ')
normaliz = normaliz.upper()
normaliz = normaliz[0]
if(normaliz <> 'M' and normaliz <> 'S' and normaliz <> 'N'):
print("Oops! Input M, S or N. Try again...")
else:
break
while True:
try:
max_alpha = float(raw_input('Start value of alpha (1.8 - 0.9) : '))
except ValueError:
print("Oops! That was no valid number. Try again...")
else:
if(max_alpha > 1.8):
print("Oops! Start value of alpha too big. Try again...")
else:
if(max_alpha < 0.9):
print("Oops! Start value of alpha too low. Try again...")
else:
break
while True:
try:
min_alpha = float(raw_input('End value of alpha (0.5 - 0.0001) : '))
except ValueError:
print("Oops! That was no valid number. Try again...")
else:
if(min_alpha > 0.5):
print("Oops! alpha too big. Try again...")
else:
if(min_alpha < 0.0001):
print("Oops! alpha too low. Try again...")
else:
break
while True:
try:
step_alpha = float(raw_input('Decreasing step of alpha (0.1 - 0.001) : '))
except ValueError:
print("Oops! That was no valid number. Try again...")
else:
if(step_alpha > 0.1):
print("Oops! Decreasing step of alpha too big. Try again...")
else:
if(step_alpha < 0.001):
print("Oops! Decreasing step of alpha too low. Try again...")
else:
break
file_input = arch_input
gruppi_num = num_gruppi
tipo_norm = normaliz
alpha_min = min_alpha
alpha_max = max_alpha
alpha_step = step_alpha
# outputs files
file_input = arch_input
tipo_norm = normaliz
gruppi_num = num_gruppi
nome_input = file_input.split(".")
arch_output = nome_input[0] + "_" + tipo_norm + "_g" + str(gruppi_num) + "_out.txt"
arch_outsrt = nome_input[0] + "_" + tipo_norm + "_g" + str(gruppi_num) + "_outsrt.txt"
arch_sort = nome_input[0] + "_" + tipo_norm + "_g" + str(gruppi_num) + "_sort.txt"
arch_catal = nome_input[0] + "_" + tipo_norm + "_g" + str(gruppi_num) + "_catal.txt"
arch_medsd = nome_input[0] + "_" + tipo_norm + "_g" + str(gruppi_num) + "_medsd.txt"
arch_cv = nome_input[0] + "_" + tipo_norm + "_g" + str(gruppi_num) + "_cv.txt"
arch_grid = nome_input[0] + "_" + tipo_norm + "_g" + str(gruppi_num) + "_grid.txt"
arch_log = nome_input[0] + "_" + tipo_norm + "_g" + str(gruppi_num) + "_log.txt"
# start time
t0 = datetime.datetime.now()
# read input file
# whitespace-separated text; first row holds column names, first column
# holds the record identifier
arr_r = []
arr_orig = []
arr_c = []
mtchx = []
mtchy = []
txt_col = []
xnomi = []
# the numbers of variables / columns in all record must be the same
n_rows = 0
n_cols = 0
err_cols = 0
index = 0
for line in open(file_input).readlines():
    linea = line.split()
    if(index == 0):
        xnomi.append(linea)   # header row
        n_cols = len(linea)
    else:
        arr_r.append(linea)   # data row (list of strings)
        if(len(linea) != n_cols):
            err_cols = 1
            print("Different numbers of variables / columns in the record " + str(index)
                  + " cols " + str(len(linea)))
    index += 1
if(err_cols == 1):
    print("File " + file_input + " contains errors. Exit ")
    quit()
# convert digit-only strings to float
# NOTE(review): str.isdigit() is False for values like "3.14" or "-2",
# so only unsigned integer literals are converted here -- confirm intent
index = 0
while index < len(arr_r):
    linea = arr_r[index]
    index_c = 0
    while index_c < len(linea):
        if linea[index_c].isdigit():
            linea[index_c] = float(linea[index_c])
        index_c += 1
    arr_r[index] = linea
    index += 1
arr_orig = copy.deepcopy(arr_r) # original input file
testata_cat = copy.deepcopy(xnomi[0]) # original header row
# finding columns containing strings and columns containing numbers
testata = xnomi[0]
testata_orig = copy.deepcopy(xnomi[0])
n_cols = len(testata) - 1   # number of data columns (column 0 = record id)
n_rows = len(arr_r)
ind_c = 1
err_type = 0
while ind_c < len(testata):
    ind_r = 1
    tipo_num = 0   # column contains at least one numeric value
    tipo_txt = 0   # column contains at least one textual value
    while ind_r < len(arr_r):
        arr_c = arr_r[ind_r]
        if is_number(arr_c[ind_c]):
            tipo_num = 1
        else:
            tipo_txt = 1
        ind_r += 1
    if tipo_num == 1 and tipo_txt == 1:
        print "The columns / variables " + testata[ind_c] + " contains both strings and numbers."
        print arr_c
        err_type = 1
    ind_c += 1
if err_type == 1:
    print "Oops! The columns / variables contains both strings and numbers. Exit. "
    quit()
# map every textual column onto equally spaced numeric codes in [0, 1]
index_c = 1
while index_c <= n_cols:
    txt_col = []
    index = 0
    while index < len(arr_r):
        arr_c = arr_r[index]
        if(isinstance(arr_c[index_c],str)):
            txt_col.append(arr_c[index_c])
        index += 1
    set_txt_col = set(txt_col) # remove duplicates
    txt_col = list(set(set_txt_col))
    txt_col.sort()
    # from strings to numbers
    if(len(txt_col) > 0):
        if(len(txt_col) > 1):
            passo1 = 1.0 / (len(txt_col) - 1)   # step between consecutive codes
        else:
            passo1 = 0.0
        index = 0
        while index < len(arr_r):
            arr_c = arr_r[index]
            campo1 = arr_c[index_c]
            indice1 = txt_col.index(campo1)   # rank of the value in sorted order
            if(len(txt_col) == 1): # same values in the column
                val_num1 = float(1)
            else:
                val_num1 = float(passo1 * indice1)
            arr_c[index_c] = val_num1 + 0.00000001 # to avoid zero values in means
            # (to prevent zero divide in CV)
            index += 1
    index_c += 1
# means, max & std
# index 0 of every stats list is a dummy so data columns stay 1-based
xmeans = []
xmaxs = []
xmins = [] ### added by Roberto 4/03/2012
xsds = []
xcv = []
index_c = 0
while index_c <= n_cols:
    xmeans.append(0.0)
    xmaxs.append(-9999999999999999.9)
    xmins.append(9999999999999999.9) ### added by Roberto 4/03/2012
    xsds.append(0.0)
    xcv.append(0.0)
    index_c += 1
# means & max
index = 0
while index < n_rows:
    arr_c = arr_r[index]
    index_c = 1
    while index_c <= n_cols:
        xmeans[index_c] += arr_c[index_c]
        if(arr_c[index_c] > xmaxs[index_c]):
            xmaxs[index_c] = arr_c[index_c]
        index_c += 1
    index += 1
index_c = 1
while index_c <= n_cols:
    xmeans[index_c] = xmeans[index_c] / n_rows
    index_c += 1
# std
index = 0
while index < n_rows:
    arr_c = arr_r[index]
    index_c = 1
    while index_c <= n_cols:
        xsds[index_c] += (arr_c[index_c] - xmeans[index_c])**2
        index_c += 1
    index += 1
index_c = 1
while index_c <= n_cols:
    # NOTE(review): the sum of squared deviations runs over n_rows samples,
    # yet it is divided by (n_cols - 1); a per-column std would normally
    # divide by n_rows or (n_rows - 1) -- confirm intent before changing
    xsds[index_c] = (xsds[index_c] / (n_cols - 1)) ** 0.5
    index_c += 1
# Means, Max, Std, CV output file
medsd_file = open(arch_medsd, 'w')
# columns names
medsd_file.write('%s %s ' % ('Function' , "\t"))
index_c = 1
while index_c <= n_cols:
    medsd_file.write('%s %s ' % (testata[index_c], "\t"))
    index_c += 1
medsd_file.write('%s' % ('\n'))
# means
medsd_file.write('%s %s ' % ('Mean' , "\t"))
index_c = 1
while index_c <= n_cols:
    valore = str(xmeans[index_c])
    valore = valore[0:6]   # truncate (not round) to 6 characters
    medsd_file.write('%s %s ' % (valore, "\t"))
    index_c += 1
medsd_file.write('%s' % ('\n'))
# max
medsd_file.write('%s %s ' % ('Max' , "\t"))
index_c = 1
while index_c <= n_cols:
    valore = str(xmaxs[index_c])
    valore = valore[0:6]
    medsd_file.write('%s %s ' % (valore, "\t"))
    index_c += 1
medsd_file.write('%s' % ('\n'))
# std
medsd_file.write('%s %s ' % ('Std' , "\t"))
index_c = 1
while index_c <= n_cols:
    valore = str(xsds[index_c])
    valore = valore[0:6]
    medsd_file.write('%s %s ' % (valore, "\t"))
    index_c += 1
medsd_file.write('%s' % ('\n'))
# CV (coefficient of variation = std / |mean|)
medsd_file.write('%s %s ' % ('CV' , "\t"))
index_c = 1
med_cv_gen = 0.0 # cv average of all columns / variables
while index_c <= n_cols:
    if xmeans[index_c] == 0:
        media1 = 0.000001   # avoid division by zero
    else:
        media1 = xmeans[index_c]
    xcv[index_c] = xsds[index_c] / abs(media1)
    valore = str(xcv[index_c])
    med_cv_gen += xcv[index_c]
    valore = valore[0:6]
    medsd_file.write('%s %s ' % (valore, "\t"))
    index_c += 1
med_cv_gen = med_cv_gen / n_cols   # overall CV before cataloguing
str_med_cv_gen = str(med_cv_gen)
str_med_cv_gen = str_med_cv_gen[0:6]
medsd_file.write('%s' % ('\n'))
medsd_file.close()
# input standardization
# standardization on max: divide each column by its maximum
if tipo_norm == 'M':
    index = 0
    while index < n_rows:
        arr_c = arr_r[index]
        index_c = 1
        while index_c <= n_cols: ## update kb_cla.py too
            if xmaxs[index_c] == 0.0:
                xmaxs[index_c] = 0.00001   # avoid division by zero
            arr_c[index_c] = arr_c[index_c] / xmaxs[index_c]
            index_c += 1
        index += 1
# standardization on std: z-score per column
if tipo_norm == 'S':
    index = 0
    while index < n_rows:
        arr_c = arr_r[index]
        index_c = 1
        while index_c <= n_cols:
            if xsds[index_c] == 0.0:
                xsds[index_c] = 0.00001   # avoid division by zero
            arr_c[index_c] = (arr_c[index_c] - xmeans[index_c]) / xsds[index_c]
            if arr_c[index_c] < xmins[index_c]: ### added by Roberto 4/03/2012
                xmins[index_c] = arr_c[index_c] ### added by Roberto 4/03/2012
            index_c += 1
        index += 1
    # subtract xmins to remove the negative values (added by Roberto 4/03/2012)
    # NOTE(review): indentation reconstructed -- this shift is placed inside
    # the 'S' branch because xmins is only updated there; outside it xmins
    # would still hold its huge sentinel value -- confirm against original
    index = 0
    while index < n_rows:
        arr_c = arr_r[index]
        index_c = 1
        while index_c <= n_cols:
            arr_c[index_c] = arr_c[index_c] - xmins[index_c]
            print arr_c[index_c]   # NOTE(review): prints every value; debug leftover?
            index_c += 1
        index += 1
    # end of addition by Roberto 4/03/2012
# start of kohonen algorithm
# min and max vectors
vmaxs = []
vmins = []
index_c = 0
while index_c <= n_cols:
    vmaxs.append(-10000000000000.0)
    vmins.append( 10000000000000.0)
    index_c += 1
# columns min & max
index = 0
while index < n_rows:
    arr_c = arr_r[index]
    index_c = 1
    while index_c <= n_cols:
        if arr_c[index_c] > vmaxs[index_c]:
            vmaxs[index_c] = arr_c[index_c]
        if arr_c[index_c] < vmins[index_c]:
            vmins[index_c] = arr_c[index_c]
        index_c += 1
    index += 1
# run parameters and temp arrays
n = n_rows
m = n_cols
nx = gruppi_num   # grid width (groups)
ny = gruppi_num   # grid height (subgroups)
ix = 950041 # integer as random seed
nsteps = int(10000 * nx * ny) # number of steps
nepoks = int(nsteps / n ** 0.5) # number of epochs
unit_calc = int(n * m * nx * ny) # running units
passo = int(5000 / n) # step of visualization on monitor
rmax = nx - 1
rmin = 1.0
if passo < 1:
    passo = 1
grid = [] # training grid, flattened nx*ny*m (addressed via ndim)
index = 0
while index < nx * ny * m:
    grid.append(0.0)
    index += 1
n=ndim(nx,ny,m,3)   # NOTE: rebinds n from the row count to the index helper
random.seed(ix) # initial value of random seed to obtain the same sequences in new runs
# random initialization of the grid coefficients
index = 0
while index < nx:
    index_c = 0
    while index_c < ny:
        index_k = 0
        while index_k < m:
            ig = n.getcellindex((index,index_c,index_k))
            grid[ig] = random.random()
            index_k += 1
        index_c += 1
    index += 1
gridp = copy.deepcopy(grid) # initial previous grid = current grid
gridm = copy.deepcopy(grid) # initial min grid = current grid
# for each record in each epoch
iter = 0
discrea = 1000000000000.0 # current error
discrep = 0.0 # previous error
if nepoks < 20:
    nepoks = 20 # min epochs = 20
nepokx = 0
min_epok = 0 # epoch with min error
min_err = 1000000000.0 # min error
alpha = float(alpha_max) # initial value of alpha parameter
ir = 0.0 # initial value of ir parameter ir
ne = 1
print " "
print 'Record ' + str(n_rows) + ' Columns ' + str(n_cols)
# main loop
# one pass per epoch: find the best-matching grid cell for every record,
# accumulate the error, and pull the neighbourhood of the winner toward
# the record by the current alpha; Ctrl/C stops training gracefully
try:
    while ne <= nepoks:
        if (ne % passo == 0): # print running message when modulo division = zero
            min_err_txt = "%14.5f" % min_err # format 8 integers and 3 decimals
            alpha_txt = "%12.5f" % alpha # format 6 integers and 5 decimals
            print ('Epoch ' + str(ne) + ' min err ' + min_err_txt + ' min epoch ' +
                   str(min_epok - 1) + " alpha " + alpha_txt)
        if min_err < 1000000000.0:
            nepokx += 1
        # a new minimum: previous epoch error beat both neighbours
        if min_err > discrea and discrep > discrea and discrea > 0.0:
            min_epok = ne # current epoch (min)
            min_err = discrea
            # copy current grid to min grid
            gridm = copy.deepcopy(grid)
            min_err_txt = "%12.3f" % min_err # format 8 integers and 3 decimals
            alpha_txt = "%12.5f" % alpha # format 6 integer and 5 decimals
            print ('**** Epoch ' + str(ne - 1) + ' WITH MIN ERROR ' + min_err_txt +
                   " alpha " + alpha_txt)
        # cheking the current value of alpha
        if alpha > alpha_min:
            discrea = discrep
            discrep = 0.0
            # copy current grid to previous grid
            gridp = copy.deepcopy(grid)
            # from the starting row to the ending row
            i = 0
            while i < n_rows:
                iter += 1
                # find the best grid coefficient (winner cell)
                ihit = 0
                jhit = 0
                dhit = 100000.0
                igx = 0
                igy = 0
                while igx < nx:
                    igy = 0
                    while igy < ny:
                        d = 0.0
                        neff = 0
                        k = 0
                        arr_c = arr_r[i]
                        while k < m: # update the sum of squared deviation of input
                                     # value from the grid coefficient
                            ig = n.getcellindex((igx,igy,k))
                            d = d + (arr_c[k+1] - grid[ig]) ** 2
                            k += 1
                        d = d / float(m)
                        # d = d / m
                        if d < dhit:
                            dhit = d
                            ihit = int(igx)
                            jhit = int(igy)
                        igy += 1
                    igx += 1
                # update iteration error
                discrep = discrep + dhit
                # now we have the coordinates of the best grid coefficient
                # neighbourhood radius shrinks with the iteration count
                ir = max(rmax * float(1001 - iter) / 1000.0 + 0.9999999999 , 1)
                ir = int(ir)
                # new alpha value to increase the radius of groups proximity
                alpha = max(alpha_max * float(1 - ne * alpha_step) , alpha_min)
                # update the grid coefficients applying alpha parameter
                inn0 = int(ihit) - int(ir)
                inn9 = int(ihit) + int(ir)
                jnn0 = int(jhit) - int(ir)
                jnn9 = int(jhit) + int(ir)
                while inn0 <= inn9:
                    jnn0 = int(jhit) - int(ir)
                    while jnn0 <= jnn9:
                        if not (inn0 < 0 or inn0 >= nx):   # clip to grid bounds
                            if not (jnn0 < 0 or jnn0 >= ny):
                                arr_c = arr_r[i]
                                k = 0
                                while k < m:
                                    ig = n.getcellindex((inn0,jnn0,k))
                                    grid[ig] += alpha * (arr_c[k+1] - grid[ig])
                                    k += 1
                        jnn0 += 1
                    inn0 += 1
                i += 1
        else:
            print
            print "Min alpha reached "
            print
            break
        ne += 1
except KeyboardInterrupt:
    print
    print "KeyboardInterrupt (Ctrl/C) "
    print
    pass
# computing results
# grid = grid min
grid = copy.deepcopy(gridm)
# write min grid file
arch_grid_file = open(arch_grid, 'w')
ii = 0
while ii < nx:
    j = 0
    while j < ny:
        k = 0
        while k < m:
            ig = n.getcellindex((ii,j,k))
            arch_grid_file.write('%6i %s %.6i %s %.6i %s %14.7f %s' % (ii,' ', j ,' ', k,' ', grid[ig], "\n"))
            k += 1
        j += 1
    ii += 1
arch_grid_file.close()
# catalog input by min grid: assign every record to its nearest cell
ii = 0
while ii < n_rows:
    ihit = 0
    jhit = 0
    dhit = 100000.0
    # from 1 to numbers of groups
    ir = 0
    while ir < nx: # from 1 to numbers of groups
        jc = 0
        while jc < ny: # from 1 to numbers of groups
            d = 0.0
            neff = 0
            k = 0
            while k < n_cols: # update the sum of squared deviation of input
                              # value from the grid coefficient
                arr_c = arr_r[ii]
                ig = n.getcellindex((ir,jc,k))
                d = d + (arr_c[k+1] - grid[ig]) ** 2
                k += 1
            d = d / m
            if d < dhit: # save the coordinates of the best coefficient
                dhit = d
                ihit = ir
                jhit = jc
            jc += 1
        ir += 1
    mtchx.append(ihit)   # winning group (x) per record
    mtchy.append(jhit)   # winning subgroup (y) per record
    ii += 1
# write arch_catal file: record number and its grid coordinates
arch_catal_file = open(arch_catal, 'w')
ii = 0
while ii < n_rows:
    arch_catal_file.write("%.6i %s %.6i %s %.6i %s" % (ii, ' ', mtchx[ii], ' ', mtchy[ii], "\n"))
    ii += 1
arch_catal_file.close()
# matrix of statistics
arr_cv = [] # CV array of the Groups and Total
arr_med = [] # means array of the Groups
riga_cv = [] # CV row in arr_cv
arr_col = [] # group temporary array
arr_grsg = [] # input data array (normalized)
arr_grsg_c = [] # copy of arr_grsg (for file out sort)
# input matrix sort in group sequence
# each row becomes "G_gg_ss recnum id v1 ... vm" so a plain sort clusters
# records of the same group together
ii = 0
ix = 0
while ii < n_rows:
    ix += 1
    gr1 = str(mtchx[ii])
    if mtchx[ii] < 10:
        gr1 = '0' + str(mtchx[ii])   # zero-pad so lexical sort matches numeric order
    sg1 = str(mtchy[ii])
    if mtchy[ii] < 10:
        sg1 = '0' + str(mtchy[ii])
    riga_norm = arr_r[ii]
    im = 0
    riga_norm1 = []
    while im <= m:
        riga_norm1.append(str(riga_norm[im]))
        im += 1
    riga_norm2 = " ".join(riga_norm1)
    gr_sg_txt = "G_" + gr1 + "_" + sg1 + " " + str(ix) + " " + riga_norm2
    arr_grsg.append(gr_sg_txt)
    ii += 1
arr_grsg.sort()
ii = 0
while ii < n_rows:
    arr_grsg_c.append(arr_grsg[ii])
    ii += 1
# setup of arr_cv matrix: one row per distinct group + a final "*Means*" row
num_gr = 0
gruppo0 = ""
ir = 0
while ir < n_rows:
    grsg_key = arr_grsg_c[ir].split()
    if not grsg_key[0] == gruppo0:   # group change (rows are sorted)
        gruppo0 = grsg_key[0]
        num_gr +=1
        ic = 1
        riga1 = []
        riga1.append(grsg_key[0])
        while ic <= m + 2: # adding new columns for row mean and n° of records
            riga1.append(0.0)
            ic += 1
        arr_cv.append(riga1) # cv row
    ir += 1
riga1 = []
riga1.append("*Means*") # adding new row for cv mean
ic = 1
while ic <= m + 2: # adding new column for row mean and n° of records
    riga1.append(0.0)
    ic += 1
arr_cv.append(riga1)
def found(x):
    """Return the index of the row in the global arr_cv whose first
    element equals *x*; returns None implicitly when nothing matches."""
    for pos in range(len(arr_cv)):
        if arr_cv[pos][0] == x:
            return pos
# for every data column, compute the CV of each group's values
# arr_grsg_c rows split as [group, recnum, record-id, v1 ... vm];
# data fields start at index 3
ir = 0
irx = len(arr_grsg_c)
ic = 3
linea_cv = arr_cv[0]
icx = len(linea_cv)
val_col = []
while ic < icx:
    ir = 0
    gruppo = ""
    val_col = []
    while ir < irx:   # rows are sorted: a group change closes the running column
        linea = arr_grsg_c[ir].split()
        if linea[0] == gruppo or gruppo == "":
            gruppo = linea[0]
            val_col.append(float(linea[ic]))
        else:
            # group finished: store its CV and restart the accumulator
            i_gruppo = found(gruppo)
            linea_cv = arr_cv[i_gruppo]
            media_v = abs(mean(val_col))
            if media_v == 0.0:
                media_v = 0.0000000001   # avoid division by zero
            std_v = sd(val_col)
            cv_v = std_v / media_v
            linea_cv[ic-2] = cv_v # cv value
            linea_cv[len(linea_cv)-1] = len(val_col) # number of records
            val_col = []
            val_col.append(float(linea[ic]))
            gruppo = linea[0]
        ir += 1
    # flush the last group of the column
    i_gruppo = found(gruppo)
    linea_cv = arr_cv[i_gruppo]
    media_v = abs(mean(val_col))
    if media_v == 0.0:
        media_v = 0.0000000001
    std_v = sd(val_col)
    cv_v = std_v / media_v
    linea_cv[ic-2] = cv_v # cv value
    linea_cv[len(linea_cv)-1] = len(val_col) # number of records
    ic += 1
# row means (per group) and grand totals
ir = 0
irx = len(arr_cv)
linea_cv = arr_cv[0]
icx = len(linea_cv) - 2   # first column beyond the CV values
ic = 1
num_rec1 = 0
while ir < irx: # rows mean
    media_riga = 0.0
    ic = 1
    num_col1 = 0
    linea_cv = arr_cv[ir]
    while ic < icx:
        media_riga += float(linea_cv[ic])
        num_col1 += 1
        ic += 1
    linea_cv[icx] = media_riga / num_col1
    num_rec1 += linea_cv[icx + 1]
    ir += 1
ir = 0
ic = 1
while ic < icx: # weighted mean of columns
    media_col = 0.0
    ir = 0
    num_rec1 = 0
    while ir < irx - 1:
        linea_cv = arr_cv[ir]
        media_col = media_col + linea_cv[ic] * linea_cv[icx+1] # linea_cv[icx+1] = number of records
        num_rec1 = num_rec1 + linea_cv[icx+1]
        ir += 1
    linea_cv = arr_cv[irx - 1]   # "*Means*" row
    linea_cv[ic] = media_col / num_rec1
    ic += 1
# updating mean of the row
linea_cv = arr_cv[irx - 1]
linea_means = linea_cv[1:icx]
media_riga = mean(linea_means)
linea_cv[icx] = media_riga # Total mean
linea_cv[icx + 1] = num_rec1 # n° of records
cv_media_gen_after = str(media_riga)
cv_media_gen_after = cv_media_gen_after[0:6]
# write cv file
testata_cv = testata   # NOTE: aliases (and mutates) the header list in place
testata_cv[0] = "*Groups*"
testata_cv.append("*Mean*")
testata_cv.append("N_recs")
arch_cv_file = open(arch_cv, 'w')
ic = 0
while ic <= icx + 1:
    arch_cv_file.write('%s %s ' % (testata_cv[ic], " "*(9-len(testata_cv[ic]))))
    ic += 1
arch_cv_file.write('%s' % ('\n'))
ir = 0
while ir < irx:
    ic = 0
    linea_cv = arr_cv[ir]
    while ic <= icx + 1:
        if ic == 0:
            arch_cv_file.write('%s %s ' % (linea_cv[0], " "))
        else:
            if ic <= icx:
                arch_cv_file.write('%7.4f %s ' % (linea_cv[ic], " "))
            else:
                arch_cv_file.write('%6i %s ' % (linea_cv[ic], " "))
        ic += 1
    arch_cv_file.write('%s' % ("\n"))
    ir += 1
ic = 0
media_xcv = mean(xcv[1:icx])
while ic <= icx : # print CV input (before catalogue)
    if ic == 0:
        arch_cv_file.write('%s %s ' % ("*CVinp*", " "))
    else:
        if ic < icx:
            arch_cv_file.write('%7.4f %s ' % (xcv[ic], " "))
        else:
            arch_cv_file.write('%7.4f %s ' % (media_xcv, " "))
            # NOTE(review): indentation reconstructed; linea_cv here is the
            # last row left over from the loop above -- confirm intent
            arch_cv_file.write('%6i %s ' % (linea_cv[ic+1], " "))
    ic += 1
arch_cv_file.write('%s' % ("\n"))
#========= instructions added by Roberto Bello 29/02/2012 ======================
#know_index = str(1.0 - float(cv_media_gen_after) / float(str_med_cv_gen))
#know_index = know_index[0:6]
#arch_cv_file.write('%s %s %s' % ('*KIndex* ', know_index, '\n'))
#========= end of instructions added by Roberto Bello 29/02/2012 ===============
arch_cv_file.close()
# writing out catalog file (original values preceded by the assigned group)
testata_cat1 = []
testata_cat1.append("*Group*")
arch_output_file = open(arch_output, 'w')
ic= 0
while ic < icx:
    testata_cat1.append(testata_cat[ic])
    ic += 1
ic= 0
while ic < len(testata_cat1):
    arch_output_file.write('%s %s ' % (testata_cat1[ic], " "*(15-len(testata_cat1[ic]))))
    ic += 1
arch_output_file.write('%s' % ("\n"))
index = 0
while index < len(arr_orig):
    riga_orig = arr_orig[index]
    ic = 0
    while ic < len(riga_orig):
        if not(isinstance(riga_orig[ic],str)):
            riga_orig[ic] = str(riga_orig[ic])   # stringify for fixed-width output
        ic += 1
    # place before 0 if gr / sg < 10
    gr1 = str(mtchx[index])
    if mtchx[index] < 10:
        gr1 = '0' + str(mtchx[index])
    sg1 = str(mtchy[index])
    if mtchy[index] < 10:
        sg1 = '0' + str(mtchy[index])
    arr_rig0 = "G_" + gr1 + "_" + sg1 + " "*8
    arch_output_file.write('%s ' % (arr_rig0))
    ic= 0
    while ic < len(riga_orig):
        arch_output_file.write('%s %s ' % (riga_orig[ic], " "*(15-len(riga_orig[ic]))))
        ic += 1
    arch_output_file.write('%s' % ("\n"))
    index += 1
# NOTE(review): arch_output_file is never closed explicitly
# writing out the sorted summary file (normalized values, group order)
testata_cat1 = []
testata_cat1.append("*Group*")
testata_cat1.append("*RecNum*")
arch_sort_file = open(arch_sort, 'w')
ic= 0
while ic < icx:
    testata_cat1.append(testata_cat[ic])
    ic += 1
ic= 0
while ic < len(testata_cat1):
    arch_sort_file.write('%s %s ' % (testata_cat1[ic], " "*(15-len(testata_cat1[ic]))))
    ic += 1
arch_sort_file.write('%s' % ("\n"))
index = 0
while index < len(arr_grsg_c):
    riga_grsg = arr_grsg_c[index].split()
    ic = 0
    while ic < len(riga_grsg):
        val_txt = riga_grsg[ic]
        val_txt = val_txt[0:13]   # truncate each field to 13 characters
        arch_sort_file.write('%s %s ' % (val_txt, " "*(15-len(val_txt))))
        ic += 1
    if index < len(arr_grsg_c) - 1:   # no trailing newline after last row
        arch_sort_file.write('%s' % ("\n"))
    index += 1
arch_sort_file.close()
# writing out catalog and sorted file (original values, sorted by group)
arr_outsrt = []
index = 0
while index < len(arr_orig):
    riga_sort = []
    # place before 0 if gr / sg < 10
    gr1 = str(mtchx[index])
    if mtchx[index] < 10:
        gr1 = '0' + str(mtchx[index])
    sg1 = str(mtchy[index])
    if mtchy[index] < 10:
        sg1 = '0' + str(mtchy[index])
    riga_sort.append("G_" + gr1 + "_" + sg1)
    ic = 0
    riga_orig = arr_orig[index]
    while ic < len(riga_orig):
        val_riga = riga_orig[ic]
        riga_sort.append(val_riga)
        ic += 1
    arr_outsrt.append(riga_sort)
    index += 1
for line in arr_outsrt:
    line = "".join(line)   # NOTE(review): rebinding the loop variable is a no-op
arr_outsrt.sort()   # list-of-lists sort: group key first, then original fields
testata_srt = []
testata_srt.append("*Group*")
arch_outsrt_file = open(arch_outsrt, 'w')
ic= 0
while ic < icx:
    testata_srt.append(testata_orig[ic])
    ic += 1
ic= 0
while ic < len(testata_srt):
    arch_outsrt_file.write('%s %s' % (testata_srt[ic], " "*(15-len(testata_srt[ic]))))
    ic += 1
arch_outsrt_file.write('%s' % ("\n"))
index = 0
key_gruppo = ""
while index < len(arr_outsrt):
    riga_sort = arr_outsrt[index]
    index_c = 0
    while index_c < len(riga_sort):
        if index_c == 0:
            if riga_sort[0] != key_gruppo:   # first record of a new group
                # arch_outsrt_file.write('%s ' % ("\n"))
                key_gruppo = riga_sort[0]
        valore = riga_sort[index_c]
        arch_outsrt_file.write('%s %s' % (valore, " "*(15-len(valore))))
        index_c += 1
    # NOTE(review): bound is len(arr_grsg_c), not len(arr_outsrt);
    # both lists have n_rows entries, so the effect is the same
    if index < len(arr_grsg_c) - 1:
        arch_outsrt_file.write('%s' % ("\n"))
    index += 1
arch_outsrt_file.close()
print("###############################################################################")
print("# KB_CAT KNOWLEDGE DISCOVERY IN DATA MINING (CATALOG PROGRAM) #")
print("# by ROBERTO BELLO (COPYRIGHT MARCH 2011 ALL RIGHTS RESERVED) #")
print("# Language used: PYTHON #")
print("###############################################################################")
arch_log_file = open(arch_log, 'w')
arch_log_file.write("%s %s" % ("############################################################################", "\n"))
arch_log_file.write("%s %s" % ("# KB_CAT KNOWLEDGE DISCOVERY IN DATA MINING (CATALOG PROGRAM) #", "\n"))
arch_log_file.write("%s %s" % ("# by ROBERTO BELLO (COPYRIGHT MARCH 2011 ALL RIGHTS RESERVED) #", "\n"))
arch_log_file.write("%s %s" % ("# Language used: PYTHON . #", "\n"))
arch_log_file.write("%s %s" % ("############################################################################", "\n"))
arch_log_file.write("%s %s %s" % ("Input File -> ", file_input, "\n"))
arch_log_file.write("%s %s %s" % ("Numer of Groups (3 - 20) -> ", str(gruppi_num), "\n"))
arch_log_file.write("%s %s %s" % ("Normalization (Max, Std, None) -> ", tipo_norm, "\n"))
arch_log_file.write("%s %s %s" % ("Start Value of alpha (from 1.8 to 0.9) -> ", str(alpha_max), "\n"))
arch_log_file.write("%s %s %s" % ("End Value of alpha (from 0.5 to 0.0001) -> ", str(alpha_min), "\n"))
arch_log_file.write("%s %s %s" % ("Decreasing step of alpha (from 0.1 to 0.001) -> ", str(alpha_step), "\n"))
arch_log_file.write("%s" % ("=========================OUTPUT=======================================================\n"))
arch_log_file.write("%s %s %s" % ("Output File Catalog.original ", arch_output, "\n"))
arch_log_file.write("%s %s %s" % ("Output File Catalog.sort ", arch_outsrt, "\n"))
arch_log_file.write("%s %s %s" % ("Output File Summary sort ", arch_sort, "\n"))
arch_log_file.write("%s %s %s" % ("Output File Matrix Catal. ", arch_catal, "\n"))
arch_log_file.write("%s %s %s" % ("Output File Means, STD, CV. ", arch_medsd, "\n"))
arch_log_file.write("%s %s %s" % ("Output File CV of the Groups ", arch_cv, "\n"))
arch_log_file.write("%s %s %s" % ("Output File Training Grid ", arch_grid, "\n"))
arch_log_file.write("%s %s %s" % ("Output File Run Parameters ", arch_log, "\n"))
#=========istruzioni aggiunte Roberto Bello 29/02/2012======================
know_index = str(1.0 - float(cv_media_gen_after) / float(str_med_cv_gen))
know_index = know_index[0:6]
arch_log_file.write('%s %s %s' % ('*KIndex* ', know_index, '\n'))
#=========fine istruzioni aggiunte da Roberto Bello 29/02/2012==============
min_err_txt = "%12.3f" % min_err # format 8 integer and 3 decimals
alpha_txt = "%12.5f" % alpha # format 6 integer and 5 decimals
alpha_min_txt = "%12.5f" % alpha_min # format 6 integer and 5 decimals
print
if min_err == 1000000000.000:
print("Oops! No result. Try again with new alpha parameters")
print
print ("EPOCH " + str(min_epok -1) + " WITH MIN ERROR " + min_err_txt +
" starting alpha " + alpha_min_txt + " ending alpha " + alpha_txt +
" Iterations " + str(iter) + " Total Epochs " + str(ne - 1))
print
print 'Output File Catalog.original ' + arch_output
print 'Output File Catalog.sort ' + arch_outsrt
print 'Output File Summary sort ' + arch_sort
print 'Output File Matrix Catal. ' + arch_catal
print 'Output File Means, STD, CV. ' + arch_medsd
print 'Output File CV of the Groups ' + arch_cv
print 'Output File Training Grid ' + arch_grid
print 'Output File Run Parameters ' + arch_log
print 'CV before Catalog ' + str_med_cv_gen
print 'CV after Catalog ' + cv_media_gen_after
know_index = str(1.0 - float(cv_media_gen_after) / float(str_med_cv_gen))
know_index = know_index[0:6]
print 'Knowledge Index ' + know_index
print
# Elapsed time
t1 = datetime.datetime.now()
elapsed_time = t1 - t0
print "Elapsed time (seconds) : " + str(elapsed_time.seconds)
print