Comments (7)
Hi - I think that it should be possible, as long as the data is appropriately shaped. Would you be able to provide a toy example?
from ivis.
Hi @idroz ,
thank you for your prompt response!
I have uploaded the file to
https://drive.google.com/open?id=1qo2R8Rz4dWvX9nMLQc50AVc1tb-PCZPg
It is a pickle file, with a dictionary inside {'X':[...], 'Y':[...]}
X has dimensions (N, T, f) with N as the number of samples, T: number of time steps, f: as the feature dimension at a time stamp.
Here is the file I am using right now to read the data and to apply ivis on a concatenated version.
"""
experiments with ivis
"""
import sys
import numpy as np
import matplotlib
##set headless mode
#matplotlib.use('Agg')
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
import pickle
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
import logging
import argparse
from sklearn.preprocessing import MinMaxScaler
from collections.abc import Iterable
import os

# Log record layout: thread name, timestamp with milliseconds, level,
# source file and line, then the message.
recfmt = '(%(threadName)s) %(asctime)s.%(msecs)03d %(levelname)s %(filename)s:%(lineno)d %(message)s'
timefmt = '%y%m%d_%H:%M:%S'

# The log file (and the figures saved by the plotting helpers) go to ./log.
# Create the directory up front; otherwise opening the log file raises
# FileNotFoundError when the script runs in a directory without it.
os.makedirs("log", exist_ok=True)

logging.basicConfig(
    # The name holds no strftime directives, so it passes through unchanged.
    # Insert e.g. "_%y%m%d_%H%M%S" before ".log" to get one log file per run.
    filename=time.strftime("log/visualize_embedding.log"),
    filemode="w",
    level=logging.INFO,
    format=recfmt,
    datefmt=timefmt,
)

# Command line: one positional argument naming the pickled X/Y data file.
parser = argparse.ArgumentParser()
parser.add_argument("xy_file", help="The X and Y data to be used. X:[[...], [...], ...], Y:[...]")
args = parser.parse_args()
def binarizeY(y):
    """
    thresholds continuous target values into a boolean mask
    :param y: sequence or array of numeric target values
    :return: boolean numpy array, False where the value is < 0.1, True otherwise
    """
    # Written as "not (value < 0.1)" rather than "value >= 0.1" so that NaN
    # entries still map to True, as they did in the element-wise loop.
    return ~(np.asarray(y) < 0.1)
def embedClassical(x, ygt, do_scaling=True, ndim=2):
    """
    embeds x with PCA, optionally after min-max scaling
    :param x: 2d array (N, features); must expose .shape when do_scaling is False
    :param ygt: target values, converted to a numpy array and forwarded to fit_transform
    :param do_scaling: if True, x is passed through MinMaxScaler().fit_transform first
    :param ndim: number of PCA components
    :return: the array returned by PCA(n_components=ndim).fit_transform
    """
    features = MinMaxScaler().fit_transform(x) if do_scaling else x
    targets = np.array(ygt)

    print(f"x.shape: {features.shape}")
    print(f"y.shape: {targets.shape}")
    print("starting embedding process")

    # Other estimators that were tried at this point: TSNE(n_components=2),
    # LocallyLinearEmbedding(n_neighbors=5, method='modified'), SpectralEmbedding().
    reducer = PCA(n_components=ndim)
    return reducer.fit_transform(features, targets)
def embedDeep(x, ygt, ndim=2, do_scaling=False, epochs=400):
    """
    embeds x with an ivis model that is fitted together with the targets ygt
    :param x: 2d array (N, features)
    :param ygt: target values passed to Ivis.fit alongside x
    :param ndim: number of embedding dimensions
    :param do_scaling: if True, x is passed through MinMaxScaler().fit_transform first
    :param epochs: forwarded to Ivis(epochs=...)
    :return: the embeddings returned by model.transform
    """
    # Imported at call time, so importing this module does not require ivis.
    from ivis import Ivis

    xi = MinMaxScaler().fit_transform(x) if do_scaling else x

    # see https://keras.io/metrics/ for more supervision metrics;
    # optional extras tried here: supervision_weight=0.01, supervision_metric='mae'
    model = Ivis(embedding_dims=ndim, k=5, epochs=epochs)
    model.fit(xi, ygt)
    embeddings = model.transform(xi)

    # NOTE(review): the label says "pred err", but the value is simply the mean
    # of whatever score_samples returns -- confirm it really is an error measure.
    y_pred = model.score_samples(xi)
    print("pred err mean: " + str(np.mean(y_pred)))
    return embeddings
def plotEmbedding(X_embedded, ygt, title="", bin_colors=True, colormap_name="jet"):
    """
    scatter-plots the first two embedding dimensions, colored by the targets
    :param X_embedded: 2d array; columns 0 and 1 are plotted
    :param ygt: target values used for coloring
    :param title: suffix inserted into the saved file name
    :param bin_colors: if True, the colors are binarizeY(ygt) instead of ygt
    :param colormap_name: matplotlib colormap name
    :return: None; saves ./log/vis_embedded_<title>.png and shows the figure
    """
    y_colors = binarizeY(ygt) if bin_colors else ygt
    print(f"x_e.shape:{X_embedded.shape}")
    print(f"y_c.shape: {y_colors.shape}")

    cmap = plt.get_cmap(colormap_name)
    first_dim, second_dim = X_embedded[:, 0], X_embedded[:, 1]
    sc = plt.scatter(first_dim, second_dim, c=y_colors, cmap=cmap)
    plt.grid()
    plt.colorbar(sc)

    out_path = "./log/vis_embedded_" + title + ".png"
    plt.savefig(out_path, bbox_inches='tight')
    plt.show()
def classificationTest(x, y, do_scaling=False):
    """
    embeds x with ivis, fits an SVM on the embedding and prints classification metrics
    :param x: 2d array (N, features)
    :param y: continuous target values; binarized with binarizeY for the classifier
    :param do_scaling: forwarded to embedDeep
    :return: None; prints the metrics and produces the sample and field plots
    """
    from sklearn import svm
    from sklearn.metrics import (
        average_precision_score,
        classification_report,
        confusion_matrix,
        roc_auc_score,
    )

    xe = embedDeep(x, y, do_scaling=do_scaling, epochs=500)
    y_bin = binarizeY(y)

    # Alternatives tried here: sklearn.linear_model.LogisticRegression(solver="lbfgs"),
    # sklearn.tree.DecisionTreeClassifier(min_samples_split=10).
    clf = svm.SVC(probability=True).fit(xe, y_bin)
    pred_labels = clf.predict(xe)
    # Column 1 is the probability of the positive (True) class.
    pos_scores = clf.predict_proba(xe)[:, 1]

    print(classification_report(y_bin, pred_labels))
    print('Confusion Matrix')
    print(confusion_matrix(y_bin, pred_labels))
    print('Average Precision: ' + str(average_precision_score(y_bin, pos_scores)))
    # ROC AUC is a ranking metric: score it on the continuous probabilities,
    # not on the thresholded labels, which collapse the curve to one point.
    print('ROC AUC: ' + str(roc_auc_score(y_bin, pos_scores)))

    plotSamples(xe, y, title="classification")
    generateRegressionField(xe, y, clf, n=350, title="classification")
def flatten1(lst):
    """
    flattens lst by exactly one level
    :param lst: iterable whose items are either iterables or single values
    :return: new list; iterable items are expanded in place (strings therefore
             become single characters), all other items are kept as they are
    """
    flat = []
    for item in lst:
        if isinstance(item, Iterable):
            flat.extend(item)
        else:
            flat.append(item)
    return flat
def treeRegressionTest(x, y, do_scaling=False):
    """
    embeds x with ivis and evaluates a decision tree regressor on the embedding
    :param x: 2d array (N, features)
    :param y: target values; must support slicing
    :param do_scaling: forwarded to embedDeep
    :return: None; prints three scores and produces the sample and field plots
    """
    from sklearn import linear_model, svm, tree

    embedded = embedDeep(x, y, ndim=2, do_scaling=do_scaling, epochs=305)
    # alternative embedding: embedClassical(x, y, ndim=2, do_scaling=do_scaling)
    # alternative regressors: linear_model.Ridge(alpha=.5), svm.SVR(),
    # or limiting the tree with max_depth=4
    regressor = tree.DecisionTreeRegressor(min_samples_split=10)

    # Fit on the first half of the samples, score on the second half.
    half = len(embedded) // 2
    regressor = regressor.fit(embedded[:half], y[:half])
    print("Score with split: " + str(regressor.score(embedded[half:], y[half:])))

    # Refit on all samples and score on the same data.
    regressor = regressor.fit(embedded, y)
    print("Score w/o split: " + str(regressor.score(embedded, y)))

    # Score again on inputs jittered by up to 5% of the first dimension's range.
    first_lo = np.min(embedded[:, 0])
    first_hi = np.max(embedded[:, 0])
    jittered = embedded + np.random.rand(*embedded.shape) * 0.05 * (first_hi - first_lo)
    print("Score w/o split on xnoisy: " + str(regressor.score(jittered, y)))

    plotSamples(embedded, y, title="regression")
    generateRegressionField(embedded, y, regressor, n=350, title="regression")
def plotSamples(x, y, title=""):
    """
    scatter-plots the first two columns of x, colored by y
    :param x: 2d array; columns 0 and 1 are plotted
    :param y: values used for coloring
    :param title: suffix appended to the saved file name
    :return: None; saves ./log/vis_xy_embedding_scatter<title>.png and shows the figure
    """
    first_dim, second_dim = x[:, 0], x[:, 1]
    plt.scatter(first_dim, second_dim, c=y)
    plt.colorbar()
    out_path = "./log/vis_xy_embedding_scatter" + title + ".png"
    plt.savefig(out_path)
    plt.show()
def generateRegressionField(x, y, clf, n, title=""):
    """
    generates an image of the regression results of the clf
    :param x: 2d array (N, 2) of embedded samples the clf was fitted on
    :param y: target values, plotted as the reference curve
    :param clf: fitted estimator; predict_proba is used for the field when
                available (first class column), otherwise predict
    :param n: number of grid points per axis of the field image
    :param title: suffix appended to the saved file name
    :return: None; saves ./log/vis_xy_embedding_space<title>.png and shows the figure
    """
    mnx, mxx = np.min(x[:, 0]), np.max(x[:, 0])
    # Jitter by up to 5% of the first dimension's range.
    xnoisy = x + np.random.rand(*x.shape) * 0.05 * (mxx - mnx)
    mny, mxy = np.min(x[:, 1]), np.max(x[:, 1])

    # Image rows sweep the second embedding dimension, columns the first.
    cols = np.linspace(mnx, mxx, n)
    rows = np.linspace(mny, mxy, n)
    grid_first, grid_second = np.meshgrid(cols, rows)
    # Feature order must match training, i.e. (x[:, 0], x[:, 1]); passing
    # (row, col) would evaluate the model on swapped coordinates.
    grid = np.column_stack([grid_first.ravel(), grid_second.ravel()])

    # Regressors have no predict_proba, so fall back to predict for them.
    # One batched call replaces n*n single-point predictions.
    if hasattr(clf, "predict_proba"):
        values = clf.predict_proba(grid)[:, 0]
    else:
        values = clf.predict(grid)
    data = np.asarray(values, dtype=float).reshape(n, n)

    fig, (ax, ax2) = plt.subplots(2, sharex=False, sharey=False, figsize=(12, 5))
    # Flip vertically so the smallest second-dimension value is at the bottom.
    ax.imshow(np.flip(data, axis=0), alpha=1.0)
    ax.set_title("regression reproduced")

    # Lower panel: targets (black) against predictions on the jittered samples (green).
    ax2.plot(y, color="black")
    ypred = clf.predict(xnoisy)
    ax2.plot(ypred, color="green")
    ax2.set_title("y")
    ax2.grid()

    plt.savefig("./log/vis_xy_embedding_space" + title + ".png")
    plt.show()
def seq2vec(x):
    """
    turns every sample's sequence F=(f[0], ... f[k]...f[T]) into one vector by concatenation
    :param x: a 3d list (N,T,f) with N: number of samples, T: number of time steps, f: feature dim
    :return: 2d list with one flattened vector per sample
    """
    return [flatten1(sample) for sample in x]
print("Loading file: " + str(args.xy_file))
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The with-block closes the file handle as soon as the data is loaded.
with open(args.xy_file, "rb") as xy_handle:
    data = pickle.load(xy_handle)
ygt = np.array(data['Y'])
x = seq2vec(data['X'])

# np.array(x) alone does not work if the sublists have varying lengths,
# so enforce the same feature length by zero-padding every vector at the end.
# There is still something dirty here..
length = max(map(len, x))
if length != min(map(len, x)):
    print("WARNING! The sublists have varying lengths!")
x = np.array([xi + [0.] * (length - len(xi)) for xi in x])

#pure embedding
#----------------
#x_embedded=embedClassical(x, ygt, do_scaling=False)
#x_embedded=embedDeep(x,ygt, do_scaling=True)
#plotEmbedding(x_embedded, ygt)

#embedding + some classification
#--------------
classificationTest(x, ygt, do_scaling=True)
#treeRegressionTest(x,ygt, do_scaling=True)
Sorry for the long file paste. The project is not on GitHub...
And thanks a lot for your support.
from ivis.
Thanks for the code - it's pretty cool that you're using ivis as a classifier/regressor!
There are only a few tweaks that need to be done to get ivis to train on your data.
1. Classification vs. Regression
Supervised ivis doesn't impose a format on the response variable Y. Instead, the default option is to run a classifier. If you specify supervision_metric='mae' (or some other regression loss supported by Keras), ivis will run in regression mode. The embedDeep method can be modified as:
def embedDeep(x, ygt, ndim=2, do_scaling=False, method='classification'):
    """
    embeds x with an ivis model that is fitted together with the targets ygt
    :param x: 2d array (N, features)
    :param ygt: target values passed to Ivis.fit alongside x
    :param ndim: number of embedding dimensions
    :param do_scaling: if True, x is passed through MinMaxScaler().fit_transform first
    :param method: 'classification' keeps the ivis defaults; any other value
                   adds supervision_metric='mae' and supervision_weight=0.9
    :return: the embeddings returned by model.transform
    """
    # Imported at call time, so importing this module does not require ivis.
    from ivis import Ivis

    xi = MinMaxScaler().fit_transform(x) if do_scaling else x

    # see https://keras.io/metrics/ for more supervision metrics
    ivis_kwargs = dict(embedding_dims=ndim, k=5, n_epochs_without_progress=5)
    if method != 'classification':
        ivis_kwargs.update(supervision_metric='mae', supervision_weight=0.9)
    model = Ivis(**ivis_kwargs)

    model.fit(xi, ygt)
    embeddings = model.transform(xi)
    y_pred = model.score_samples(xi)
    print("pred err mean: " + str(np.mean(y_pred)))
    return embeddings
2. Label binarisation
ivis works with numpy arrays, and we can binarise Y as:
def binarizeY(y):
    """
    thresholds target values into integer class labels
    :param y: sequence or array of numeric target values
    :return: numpy array holding 0 where the value is < 0.1 and 1 otherwise
    """
    below_threshold = np.array(y) < 0.1
    return np.where(below_threshold, 0, 1)
Run your binariser before invoking the ivis.fit method in classificationTest:
# Excerpt for classificationTest: binarise the labels before fitting, so that
# embedDeep receives the 0/1 targets instead of the raw y values.
y_bin=binarizeY(y)
xe=embedDeep(x, y_bin, do_scaling= do_scaling)
3. Hyperparameter tuning
I would recommend changing the default ivis parameters to: k=15, model='maaten', and n_epochs_without_progress=5.
This will converge a lot faster and won't overfit the data. We generally recommend leaving the epochs parameter set to the default value and tuning the early stopping parameter via n_epochs_without_progress.
supervision_weight is also a very interesting parameter, as it controls the degree to which the classifier focuses on supervision vs. unsupervised dimensionality reduction. Setting this value higher, e.g. 0.90 or 0.95, will produce desirable classifiers.
If you're interested in classification rather than visualisation, increasing embedding_dims could also be useful. Increasing dimensionality to 50 produces a reasonable AUC:
              precision    recall  f1-score   support

           0       0.85      0.92      0.89       222
           1       0.92      0.85      0.88       233

    accuracy                           0.88       455
   macro avg       0.89      0.88      0.88       455
weighted avg       0.89      0.88      0.88       455

Confusion Matrix
[[205 17]
[ 36 197]]
Average Precision: 0.966985040236684
ROC AUC: 0.8844584928275915
The jury is still out a little bit on how to best scale the data - MinMaxScaler, StandardScaler, or no scaling seem to work well depending on the dataset!
from ivis.
from ivis.
I think what you have is fine. You could also look into Keras' pad_sequences method, which I think does pretty much what you've written.
Ivis uses KNN retrieval as the first step in the algorithm, which relies on a 2-D array and doesn't support nested multidimensional lists.
Theoretically, you could write your own base neural network using Keras layers and pass that into the model hyperparameter. Something like:
Input -> Reshape -> 1DConv -> MaxPooling -> Dense
You could then pass the 2D array into ivis and let your base network do the rest. It may be overkill, as the maaten architecture (3 Dense layers, 500-500-2000) will work well in most cases.
from ivis.
@mojovski Hope this helped you a bit. Will close the issue, but feel free to re-open if anything else is outstanding.
from ivis.
Thanks a lot! Will be able to continue the work on this in 2 weeks and, if appropriate, post some results.
from ivis.
Related Issues (20)
- `NotFittedError` after caching and reloading fitted `Ivis` instance HOT 2
- Suggest implementing `predict_proba` and `predict` methods for Ivis object. HOT 1
- How does ivis compare to UMAP? HOT 2
- Add conda-forge package
- About scaling HOT 2
- `KeyError` followed by `joblib.externals.loky.process_executor.BrokenProcessPool` when using `sklearn.model_selection.GridSearchCV` with `n_jobs != 1` HOT 3
- One of the unit tests (knn_retrieval) can fail (machine dependent?) HOT 1
- OSError HOT 1
- attempt to apply non-function HOT 9
- Extremely slow extraction of KNN neighbours on 100k samples HOT 4
- InternalError: Graph execution error: HOT 4
- 2D visulization of crowded cluster with ivis HOT 1
- model_save: optimizer is not compatible with pickle HOT 4
- How to get stable results? HOT 4
- Ivis is not able to run inference on a sparse matrix
- Reproducibility HOT 2
- `chunk_size` in knn set to 0 HOT 2
- Ivis seems to provoke errors when composing a sklearn.pipeline.Pipeline passed to sklearn.model_selection.GridSearchCV and executed in parallel HOT 10
- classification_weight Parameter HOT 2
- Meaning of "Observations" on https://bering-ivis.readthedocs.io/en/latest/hyperparameters.html HOT 2
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from ivis.