Hi @trevorstephens,
I am not sure whether this is a bug or whether the documentation for SymbolicTransformer is misleading.
Below is a showcase where SymbolicRegressor recovers the equation that generates the dataset, while SymbolicTransformer does not behave the same way.
Starting with SymbolicRegressor, I built an "easy" dataset to check whether it finds the correct expression with good metrics.
from gplearn.genetic import SymbolicRegressor
from sklearn import metrics
import pandas as pd
import numpy as np
# Generate a synthetic dataset: the target is min(X0, X1) * X2
X = np.random.uniform(0, 100, size=(100, 3))
y = np.min(X[:, :2], axis=1) * X[:, 2]
# Hold out the last 20 samples for testing
index = 80
X_train, y_train = X[:index, :], y[:index]
X_test, y_test = X[index:, :], y[index:]
function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                'abs', 'neg', 'inv', 'max', 'min', 'sin', 'cos', 'tan']
est_gp = SymbolicRegressor(population_size=5000,
generations=20, stopping_criteria=0.001,
function_set=function_set,
p_crossover=0.7, p_subtree_mutation=0.1,
p_hoist_mutation=0.05, p_point_mutation=0.1,
max_samples=0.9, verbose=1,
n_jobs=1,
parsimony_coefficient=0.01, random_state=0)
est_gp.fit(X_train, y_train)
print('Score:', est_gp.score(X_test, y_test),
      metrics.mean_absolute_error(y_test, est_gp.predict(X_test)))
print(est_gp._program)
This example gives a perfect result, and the MAE is essentially zero, as the output shows:
| Population Average | Best Individual |
---- ------------------------- ------------------------------------------ ----------
Gen Length Fitness Length Fitness OOB Fitness Time Left
0 11.81 8396.89543051 10 25.3022470326 26.608049431 35.35s
1 12.36 8904.35549713 8 20.0284767508 19.0994923956 37.34s
2 13.74 37263.312834 8 7.82583874247e-14 2.13162820728e-14 36.67s
Score: 1.0 5.71986902287e-14
abs(div(neg(X2), inv(min(X0, X1))))
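Note that the winning program is algebraically identical to the target, since all inputs are positive: abs(div(neg(X2), inv(min(X0, X1)))) reduces to min(X0, X1) * X2. A quick sanity check (a minimal sketch reusing X and y from the script above):
# Evaluate the evolved expression by hand and compare it with the target
evolved = np.abs(-X[:, 2] / (1.0 / np.minimum(X[:, 0], X[:, 1])))
print(np.allclose(evolved, y))  # True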
However, with SymbolicTransformer, although the fit works well, the transform does not. See the following example, identical to the previous one but using SymbolicTransformer:
from gplearn.genetic import SymbolicTransformer
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import metrics
# Same synthetic dataset as before: the target is min(X0, X1) * X2
X = np.random.uniform(0, 100, size=(100, 3))
y = np.min(X[:, :2], axis=1) * X[:, 2]
index = 80
X_train, y_train = X[:index, :], y[:index]
X_test, y_test = X[index:, :], y[index:]
# Linear model - Original features
est_lin = linear_model.Lars()
est_lin.fit(X_train, y_train)
print('Lars(orig):', est_lin.score(X_test, y_test),
      metrics.mean_absolute_error(y_test, est_lin.predict(X_test)))
# Create added value features
function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                'abs', 'neg', 'inv', 'max', 'min']
gp = SymbolicTransformer(generations=20, population_size=2000,
hall_of_fame=100, n_components=10,
function_set=function_set,
parsimony_coefficient=0.0005,
max_samples=0.9, verbose=1,
random_state=0, n_jobs=3)
gp.fit(X_train, y_train)
gp_features = gp.transform(X)
# Linear model - Transformed features
newX = np.hstack((X, gp_features))
print('newX:', np.shape(newX))
est_lin = linear_model.Lars()
est_lin.fit(newX[:index,:], y_train)
print('Lars(trans):', est_lin.score(newX[index:, :], y_test),
      metrics.mean_absolute_error(y_test, est_lin.predict(newX[index:, :])))
# Linear model - "The" feature
newX = np.append(X, (np.min(X[:,:2],axis=1)*X[:,2]).reshape(-1,1), axis=1)
print('newX:', newX.shape)
est_lin = linear_model.Lars()
est_lin.fit(newX[:index,:], y_train)
print('Lars(trans):', est_lin.score(newX[index:, :], y_test),
      metrics.mean_absolute_error(y_test, est_lin.predict(newX[index:, :])))
I use Lars from sklearn to avoid Ridge's sparse weights and to find the best solution quickly for this easy, exact example. As the results of this code show (below), although the fitness becomes perfect during the fit, the features generated by transform seem to be wrong (see also the correlation check after the output). The problem does not come from Lars: the last Lars example shows that adding "the feature", which is the target itself, gives perfect accuracy.
X: (100, 3)
y: (100,)
Lars(orig): 0.850145084161 518.34496409
| Population Average | Best Individual |
---- ------------------------- ------------------------------------------ ----------
Gen Length Fitness Length Fitness OOB Fitness Time Left
0 14.62 0.349810294784 6 0.954248106272 0.939129495332 16.04s
1 16.01 0.601354215127 6 1.0 1.0 25.56s
newX: (100, 13)
Lars(trans): 0.83552794823 497.438879508
newX: (100, 4)
Lars(trans): 1.0 1.60411683936e-12
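To quantify how informative the transformed features actually are, each column of gp_features can be checked for correlation with the target (a minimal sketch; note that a constant feature such as mul(X2, inv(X2)) produces nan here). If transform used the best fitted programs, at least one column should reach an absolute correlation close to 1.0:
# Absolute Pearson correlation of each transformed feature with y
for i in range(gp_features.shape[1]):
    print(i, abs(np.corrcoef(gp_features[:, i], y)[0, 1]))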
So I decided to inspect the features fitted during fit. Some of them are perfect; however, transform does not seem to use them correctly when building gp_features:
>>> print('Eq. of new features:', gp)
[mul(mul(neg(sqrt(min(neg(mul(mul(X1, X0), add(inv(log(abs(-0.575))), neg(mul(mul(X1, X0), sub(X2, 0.904)))))), X2))), sqrt(max(X2, X2))), X1),
div(min(div(abs(X0), log(0.901)), log(max(X2, -0.222))), X0),
mul(sub(X1, X0), mul(X1, X0)),
mul(X2, inv(X2)),
mul(mul(neg(sqrt(min(X0, X2))), add(neg(X0), min(X0, X2))), X1),
div(abs(mul(X0, X2)), inv(mul(mul(neg(sqrt(min(X0, X2))), mul(neg(X2), max(X1, X1))), X1))),
div(abs(mul(X0, X2)), inv(mul(0.640, mul(X1, X0)))),
div(abs(mul(X0, X2)), inv(sub(min(sqrt(log(max(X1, X2))), neg(sqrt(mul(X0, 0.424)))), mul(sub(min(sub(-0.603, 0.299), sub(0.063, X1)), neg(min(X1, -0.125))), mul(max(mul(X0, X2), sqrt(X0)), min(sub(X1, 0.570), log(0.341))))))),
mul(neg(mul(div(X2, -0.678), neg(X1))), div(sqrt(max(X2, X2)), min(X1, X0)))]
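As far as I understand, transform executes the programs stored in gp._best_programs, so these can be listed together with their fitness values (a sketch relying on gplearn internals, which may change between versions):
>>> for program in gp._best_programs:
>>>     print(program, program.fitness_, program.oob_fitness_)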
>>>
>>> df = pd.DataFrame(columns=['Gen', 'OOB_fitness', 'Equation'])
>>> for idGen in range(len(gp._programs)):
>>>     for idPopulation in range(gp.population_size):
>>>         program = gp._programs[idGen][idPopulation]
>>>         if program is not None:
>>>             df = df.append({'Gen': idGen,
>>>                             'OOB_fitness': program.oob_fitness_,
>>>                             'Equation': str(program)},
>>>                            ignore_index=True)
>>>
>>> print('Best of last Gen:')
>>> print(df[df['Gen'] == df['Gen'].max()].sort_values('OOB_fitness'))
Best of last Gen:
Gen OOB_fitness Equation
1126 2.0 0.000000 add(0.944, sub(X0, X0))
952 2.0 0.000000 div(min(X2, X0), min(X2, X0))
1530 2.0 0.000000 min(inv(neg(abs(log(min(X1, 0.535))))), neg(su...
2146 2.0 0.000000 div(abs(mul(X0, X2)), inv(mul(mul(neg(sqrt(min...
2148 2.0 0.000000 div(min(add(-0.868, -0.285), X2), sqrt(sqrt(0....
2150 2.0 0.000000 sub(-0.603, 0.299)
2476 2.0 0.000000 min(min(max(X0, X2), add(-0.738, 0.612)), sqrt...
1601 2.0 0.000000 neg(min(X1, -0.125))
1271 2.0 0.000000 add(-0.504, 0.058)
1742 2.0 0.000000 add(inv(log(abs(-0.575))), inv(log(abs(-0.575))))
733 2.0 0.000000 abs(-0.575)
1304 2.0 0.000000 abs(sqrt(-0.758))
1630 2.0 0.000000 div(abs(mul(X0, X2)), inv(mul(max(X2, X2), add...
652 2.0 0.000000 log(0.341)
1708 2.0 0.000000 0.904
2262 2.0 0.000000 sqrt(-0.715)
1338 2.0 0.000000 mul(X2, sub(X1, X1))
826 2.0 0.000000 div(min(X2, add(sub(neg(sub(0.096, -0.886)), m...
1615 2.0 0.000000 abs(add(0.640, 0.766))
2415 2.0 0.000000 log(abs(-0.575))
1670 2.0 0.000000 min(X0, 0.657)
1644 2.0 0.000000 log(min(-0.524, X0))
2361 2.0 0.000000 0.944
785 2.0 0.000000 min(inv(log(abs(log(min(X1, 0.535))))), neg(mu...
2367 2.0 0.000000 abs(-0.911)
2249 2.0 0.000000 0.904
960 2.0 0.000000 inv(inv(-0.045))
955 2.0 0.000000 div(add(X1, X2), inv(sub(X2, X2)))
2397 2.0 0.000000 -0.125
1878 2.0 0.000000 div(min(X2, add(sub(neg(sub(0.096, -0.886)), m...
... ... ... ...
1103 2.0 0.997786 mul(X2, abs(sub(mul(X0, X1), add(X2, X0))))
2225 2.0 0.997790 mul(sub(min(log(div(X0, -0.717)), neg(sqrt(mul...
1890 2.0 0.998069 mul(sub(div(X2, 0.309), neg(X2)), sub(max(X2, ...
1704 2.0 0.998283 add(sub(log(min(add(0.769, X1), abs(X1))), sub...
1829 2.0 0.998284 add(inv(log(abs(-0.575))), neg(mul(mul(X1, X0)...
700 2.0 0.998345 add(sub(log(min(add(0.769, X1), abs(X1))), sub...
1770 2.0 0.998638 mul(add(min(X0, min(X1, X1)), X2), sqrt(abs(ab...
2344 2.0 0.998692 div(min(X2, add(sub(neg(sub(0.096, abs(-0.575)...
985 2.0 0.998793 sub(min(mul(sub(min(sqrt(log(max(X1, X2))), ne...
1634 2.0 0.998815 add(inv(log(abs(-0.575))), neg(mul(mul(X1, X0)...
1412 2.0 0.998945 mul(sub(min(sqrt(log(max(X1, X2))), neg(sqrt(m...
855 2.0 0.998965 add(inv(log(abs(X1))), neg(mul(mul(X1, X0), su...
839 2.0 0.998996 add(inv(abs(add(min(X0, min(X1, X1)), X2))), n...
1528 2.0 0.999066 add(sub(log(min(add(0.769, X1), abs(X1))), sub...
690 2.0 0.999875 add(sub(log(min(add(0.769, X1), abs(X1))), sub...
2047 2.0 0.999895 sub(min(neg(X1), div(X1, X2)), sub(min(abs(X1)...
1951 2.0 0.999921 sub(min(min(X2, X0), X2), mul(min(X1, X0), neg...
1981 2.0 0.999954 mul(X2, neg(neg(min(add(0.448, X0), sub(X1, -0...
2349 2.0 0.999954 sub(min(abs(X1), X2), mul(min(X1, X0), neg(X2)))
2364 2.0 0.999960 add(inv(log(abs(-0.575))), mul(X2, neg(neg(min...
2487 2.0 0.999971 sub(min(abs(X1), X2), mul(min(X1, X0), neg(X2)))
2056 2.0 0.999975 sub(min(abs(X1), X2), mul(min(X1, X0), neg(X2)))
1559 2.0 0.999976 mul(X2, neg(neg(min(add(0.448, X0), abs(X1)))))
975 2.0 0.999982 sub(min(abs(X1), X2), mul(min(X1, X0), neg(X2)))
2032 2.0 0.999992 sub(min(abs(X1), X2), mul(min(X1, X0), neg(X2)))
1288 2.0 1.000000 sub(min(div(-0.992, X2), X2), mul(min(X1, X0),...
2482 2.0 1.000000 sub(min(abs(inv(neg(X1))), X2), mul(min(X1, X0...
1776 2.0 1.000000 mul(min(mul(add(X0, X0), abs(log(X1))), min(ab...
2392 2.0 1.000000 mul(neg(X2), max(div(0.933, X0), min(X0, min(X...
1329 2.0 1.000000 mul(min(X1, X0), neg(X2))
[2000 rows x 3 columns]
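Indeed, the best program of the last generation, mul(min(X1, X0), neg(X2)), is just -min(X0, X1) * X2, which is perfectly (anti-)correlated with the target, yet it does not appear to be reflected in the transformed columns. A quick check (a minimal sketch with numpy only):
>>> feat = np.min(X[:, :2], axis=1) * -X[:, 2]  # mul(min(X1, X0), neg(X2))
>>> print(np.corrcoef(feat, y)[0, 1])  # -1.0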
Is this a bug? I am doing the same thing as explained in the SymbolicTransformer example.