Comments (7)

idroz commented on May 18, 2024

Hi - I think that it should be possible, as long as the data is appropriately shaped. Would you be able to provide a toy example?

mojovski commented on May 18, 2024

Hi @idroz ,
thank you for your prompt response!
I have uploaded the file to
https://drive.google.com/open?id=1qo2R8Rz4dWvX9nMLQc50AVc1tb-PCZPg

It is a pickle file, with a dictionary inside {'X':[...], 'Y':[...]}
X has dimensions (N, T, f), with N the number of samples, T the number of time steps, and f the feature dimension at each time step.

Here is the script I am using right now to read the data and to apply ivis to a concatenated version.

"""
experiments with ivis
"""
import sys
import numpy as np
import matplotlib
##set headless mode
#matplotlib.use('Agg')
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

import pickle
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
import logging
import argparse

from collections.abc import Iterable

recfmt = '(%(threadName)s) %(asctime)s.%(msecs)03d %(levelname)s %(filename)s:%(lineno)d %(message)s'
timefmt = '%y%m%d_%H:%M:%S'
logging.basicConfig(filename=time.strftime("log/visualize_embedding.log"), #%y%m%d_%H%M%S.log"),
                        filemode="w",
                        level=logging.INFO,
                        format=recfmt, datefmt=timefmt)


parser = argparse.ArgumentParser()
parser.add_argument("xy_file", help="The X and Y data to be used. X:[[...], [...], ...], Y:[...]")
args = parser.parse_args()




def binarizeY(y):
	yc=[]
	for yi in y:
		if yi<0.1:
			yc.append(False)
		else:
			yc.append(True)
	return np.array(yc)

def embedClassical(x,ygt, do_scaling=True, ndim=2):

	xi=x
	if do_scaling:
		xi= MinMaxScaler().fit_transform(x)

	
	ygt=np.array(ygt)

	print("x.shape: "+str(xi.shape))
	print("y.shape: "+str(ygt.shape))
	print("starting embedding process")

	#X_embedded = TSNE(n_components=2).fit_transform(x, ygt)
	#X_embedded = LocallyLinearEmbedding(n_neighbors=5, method='modified').fit_transform(x,ygt)
	#X_embedded = SpectralEmbedding().fit_transform(x, ygt)
	#pca=PCA(n_components=2).fit(xi, ygt)
	#print("pca explains: "+str(pca.explained_variance_))
	X_embedded = PCA(n_components=ndim).fit_transform(xi, ygt)

	return X_embedded

def embedDeep(x,ygt, ndim=2, do_scaling=False, epochs=400):
	from ivis import Ivis
	xi=x
	#print("x: "+str(x))
	if do_scaling:
		xi= MinMaxScaler().fit_transform(x)

	#see https://keras.io/metrics/ for more supervision metrics
	model = Ivis(embedding_dims=ndim, k=5, epochs=epochs)#, supervision_weight=0.01)#,  supervision_metric='mae')
	model.fit(xi, ygt)
	embeddings=model.transform(xi)
	
	y_pred = model.score_samples(xi)
	print("pred err mean: "+str(np.mean(y_pred)))
	return embeddings



def plotEmbedding(X_embedded, ygt, title="", bin_colors=True, colormap_name="jet"):
	y_colors=ygt#binarizeY(ygt)
	if bin_colors:
		y_colors=binarizeY(ygt)
	print("x_e.shape:"+str(X_embedded.shape))
	print("y_c.shape: "+str(y_colors.shape))

	sc=plt.scatter(X_embedded[:,0], X_embedded[:,1], c=y_colors, cmap= plt.get_cmap(colormap_name))#, y_colors)
	plt.grid()
	plt.colorbar(sc)
	#plt.savefig("./log/vis_embedded.pdf", bbox_inches='tight', format='pdf')
	plt.savefig("./log/vis_embedded_"+title+".png", bbox_inches='tight')
	plt.show()



def classificationTest(x,y, do_scaling=False):
	from sklearn.linear_model import LogisticRegression
	from sklearn import tree, svm

	from sklearn.metrics import confusion_matrix, average_precision_score, roc_auc_score, classification_report

	xe=embedDeep(x,y, do_scaling= do_scaling, epochs=500)
	y_bin=binarizeY(y)
	#clf=LogisticRegression(solver="lbfgs").fit(xe, y_bin)
	#clf=tree.DecisionTreeClassifier(min_samples_split=10).fit(xe,y_bin)
	clf = svm.SVC(probability=True).fit(xe, y_bin)

	pred_labels = clf.predict(xe)
	proba = clf.predict_proba(xe)
	
	print(classification_report(y_bin, pred_labels))

	print('Confusion Matrix')
	print(confusion_matrix(y_bin, pred_labels))
	print('Average Precision: '+str(average_precision_score(y_bin, proba[:, 1])))
	print('ROC AUC: '+str(roc_auc_score(y_bin, pred_labels)))

	plotSamples(xe, y, title="classification")

	generateRegressionField(xe, y, clf, n=350, title="classification")


def flatten1(lst):
	out=[]
	for li in lst:
		if not isinstance(li, Iterable):
			out.append(li)
		else:
			for lii in li:
				out.append(lii)
	return out


def treeRegressionTest(x,y, do_scaling=False):
	from sklearn import tree
	from sklearn import linear_model
	from sklearn import svm
	xe=embedDeep(x,y, ndim=2, do_scaling= do_scaling, epochs=305)
	#xe=embedClassical(x,y, ndim=2, do_scaling= do_scaling)
	clf = tree.DecisionTreeRegressor(min_samples_split=10)#max_depth=4)
	#clf=linear_model.Ridge(alpha=.5)
	#clf = svm.SVR()

	n2=int(len(xe)/2)
	xe1=xe[:n2]
	y1=y[:n2]

	xe2=xe[n2:]
	y2=y[n2:]
	clf = clf.fit(xe1, y1)
	score=clf.score(xe2, y2)
	
	print("Score with split: "+str(score))

	clf = clf.fit(xe, y)
	score=clf.score(xe, y)
	print("Score w/o split: "+str(score))

	mnx=np.min(xe[:,0])
	mxx=np.max(xe[:,0])
	xnoisy=xe+np.random.rand(*xe.shape)*0.05*(mxx-mnx)
	score=clf.score(xnoisy, y)
	print("Score w/o split on xnoisy: "+str(score))

	plotSamples(xe, y, title="regression")

	generateRegressionField(xe, y, clf, n=350, title="regression")

def plotSamples(x,y, title=""):
	plt.scatter(x[:,0], x[:,1], c=y)
	plt.colorbar()
	plt.savefig("./log/vis_xy_embedding_scatter"+title+".png")
	plt.show()


def generateRegressionField(x,y, clf, n, title=""):
	"""
	generates an image of the regression results of the clf
	"""
	data=np.zeros((n,n))
	mnx=np.min(x[:,0])
	mxx=np.max(x[:,0])
	xnoisy=x+np.random.rand(*x.shape)*0.05*(mxx-mnx)
	

	mny=np.min(x[:,1])
	mxy=np.max(x[:,1])
	
	idr=-1
	for ri in np.linspace(mny, mxy, n):
		idr+=1
		idc=-1
		for ci in np.linspace(mnx, mxx, n):
			idc+=1
			#feature order must match training: dim 0 spans (mnx, mxx), dim 1 spans (mny, mxy)
			if hasattr(clf, "predict_proba"):
				vi=clf.predict_proba([[ci,ri]])[0][0]
			else:
				#regressors (e.g. DecisionTreeRegressor) have no predict_proba
				vi=clf.predict([[ci,ri]])[0]
			data[idr, idc]=vi

	fig, (ax, ax2) = plt.subplots(2, sharex=False, sharey=False, figsize=(12, 5))

	ax.imshow(np.flip(data, axis=0), alpha=1.0)
	ax.set_title("regression reproduced")
	#plt.colorbar()
	#plt.scatter((x[:,0]-mnx)/(mxx-mnx)*n, (x[:,1]-mny)/(mxy-mny)*n, c=y)
	#plt.show()

	ax2.plot(y, color="black")
	ypred=clf.predict(xnoisy)
	ax2.plot(ypred, color="green")
	ax2.set_title("y")
	ax2.grid()
	plt.savefig("./log/vis_xy_embedding_space"+title+".png")

	plt.show()



def seq2vec(x):
	"""
	produces a single vector from F=(f[0], ... f[k]...f[T]) by concatenation
	:param x: a 3d list (N,T,f) with N: number of samples, T: number of time steps, f: feature dim
	:return: 2d list
	"""
	out=[]
	for xi in x: #N
		vt=flatten1(xi)
		out.append(vt)
	return out


print("Loading file: "+str(args.xy_file))
with open(args.xy_file, "rb") as f:
	data=pickle.load(f)
ygt  = np.array(data['Y'])

x=seq2vec(data['X'])
#x=np.array([np.array(xi) for xi in x]) #this does not work if the sublists have varying lengths
#enforce same length of features. there is still something dirty..
length = max(map(len, x))
if length!=min(map(len,x)):
	print("WARNING! The sublists have varying lengths!")
x=np.array([xi+[0.]*(length-len(xi)) for xi in x])

#pure embedding
#----------------
#x_embedded=embedClassical(x, ygt,  do_scaling=False)
#x_embedded=embedDeep(x,ygt, do_scaling=True)
#plotEmbedding(x_embedded, ygt)


#embedding + some classification
#--------------
classificationTest(x, ygt, do_scaling=True)
#treeRegressionTest(x,ygt, do_scaling=True)

Sorry for the long file paste. The project is not on GitHub...

And thanks a lot for your support.

idroz commented on May 18, 2024

Thanks for the code - it's pretty cool that you're using ivis as a classifier/regressor!

There are only a few tweaks that need to be done to get ivis to train on your data.

1. Classification vs. Regression
Supervised ivis doesn't impose a format on the response variable Y; by default it runs as a classifier. If you specify supervision_metric='mae' (or another regression loss supported by Keras), ivis will run in regression mode. The embedDeep method can be modified as follows:

def embedDeep(x, ygt, ndim=2, do_scaling=False, method='classification'):
    from ivis import Ivis
    xi=x
    #print("x: "+str(x))
    if do_scaling:
        xi= MinMaxScaler().fit_transform(x)

    #see https://keras.io/metrics/ for more supervision metrics
    if method=='classification':
        model = Ivis(embedding_dims=ndim, k=5, n_epochs_without_progress=5)
    else:
        model = Ivis(embedding_dims=ndim, k=5, n_epochs_without_progress=5, supervision_metric='mae', supervision_weight=0.9)
    
    model.fit(xi, ygt)
    embeddings=model.transform(xi)
    
    y_pred = model.score_samples(xi)
    print("pred err mean: "+str(np.mean(y_pred)))
    return embeddings

2. Label binarisation
ivis works with numpy arrays, and we can binarise Y as:

def binarizeY(y):    
    return np.where(np.array(y)<0.1, 0, 1)

Run your binariser before invoking the ivis fit method in classificationTest:

y_bin=binarizeY(y)
xe=embedDeep(x, y_bin, do_scaling= do_scaling)

3. Hyperparameter tuning
I would recommend changing default ivis parameters to:
k=15, model='maaten', and n_epochs_without_progress=5

This will converge a lot faster and won't overfit the data. We generally recommend leaving the epochs parameter at its default value and tuning early stopping via n_epochs_without_progress instead.

supervision_weight is also a very interesting parameter, as it controls the degree to which the classifier focuses on supervision vs. unsupervised dimensionality reduction. Setting it higher, e.g. 0.90 or 0.95, will produce stronger classifiers.
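Putting those recommendations together, a minimal sketch of the suggested configuration (for the regression case, add supervision_metric='mae' as shown earlier):

from ivis import Ivis

# larger k, the 'maaten' base network, and early stopping via
# n_epochs_without_progress rather than tuning epochs directly
model = Ivis(embedding_dims=2, k=15, model='maaten',
             n_epochs_without_progress=5, supervision_weight=0.9)
model.fit(xi, y_bin)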

If you're interested in classification rather than visualisation, increasing embedding_dims could also be useful. Increasing dimensionality to 50 produces a reasonable AUC:

              precision    recall  f1-score   support

           0       0.85      0.92      0.89       222
           1       0.92      0.85      0.88       233

    accuracy                           0.88       455
   macro avg       0.89      0.88      0.88       455
weighted avg       0.89      0.88      0.88       455

Confusion Matrix
[[205  17]
 [ 36 197]]
Average Precision: 0.966985040236684
ROC AUC: 0.8844584928275915

The jury is still out a little bit on how to best scale the data - MinMaxScaler, StandardScaler, or no scaling seem to work well depending on the dataset!

mojovski commented on May 18, 2024

idroz commented on May 18, 2024

I think what you have is fine. You could also look into Keras' pad_sequences method, which I think does pretty much what you've written.
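For reference, a minimal sketch of that approach, assuming x is the list of variable-length flattened vectors produced by seq2vec in your script:

from tensorflow.keras.preprocessing.sequence import pad_sequences

# zero-pad every sequence to the length of the longest one, appending
# zeros at the end (equivalent to the manual list-padding in the script)
x = pad_sequences(x, dtype='float32', padding='post', value=0.0)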

Ivis uses KNN retrieval as the first step in the algorithm, which relies on a 2-D array and doesn't support nested multidimensional lists.

Theoretically, you could write your own base neural network using Keras layers and pass it in via the model hyperparameter. Something like:

Input -> Reshape -> 1DConv -> MaxPooling -> Dense

You could then pass the 2D array into ivis and let your base network do the rest. It may be overkill, though, as the maaten architecture (three Dense layers: 500-500-2000) works well in most cases.
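A rough, untested sketch of what that could look like, assuming an untrained Keras Model can be passed via the model argument; T and f are placeholders for the time steps and per-step feature dimension of the original (N, T, f) data, and a Flatten is added before the final Dense so the base network emits a flat feature vector:

from tensorflow.keras.layers import Input, Reshape, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from ivis import Ivis

T, f = 100, 8  # hypothetical values; use your own sequence length and feature dim

# base network: un-flatten the padded (T*f,) vectors and convolve over time
inputs = Input(shape=(T * f,))
h = Reshape((T, f))(inputs)
h = Conv1D(filters=32, kernel_size=3, activation='relu')(h)
h = MaxPooling1D(pool_size=2)(h)
h = Flatten()(h)
outputs = Dense(128, activation='relu')(h)
base_network = Model(inputs, outputs)

model = Ivis(embedding_dims=2, k=15, model=base_network)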

idroz commented on May 18, 2024

@mojovski Hope this helped you a bit. Will close the issue, but feel free to re-open if anything else is outstanding.

mojovski commented on May 18, 2024

Thanks a lot! I'll be able to continue the work on this in 2 weeks and, if appropriate, post some results.
