nuaa-al / alipy Goto Github PK

ALiPy: Active Learning in Python is an active learning python toolbox, which allows users to conveniently evaluate, compare and analyze the performance of active learning methods.

Home Page: http://parnec.nuaa.edu.cn/huangsj/alipy/

License: BSD 3-Clause "New" or "Revised" License

Python 100.00%

active-learning machine-learning python toolbox

alipy's People

Contributors

Stargazers

Watchers

Forkers

ningkp smartse tomarraj008 awesome-archive nothinglz amoliu barbecacov yyht drheli sx1616039 quebradawill ml-lab yhjohn163 dlwbm123 coolshan008 amitkumarj441 fengzhou4 evanzhu2013 1210xx songfgh enhui-huang houchaoqun lyzl2010 ssameerr mars-wei danielzhang111cn liuweiping2020 fmx789 homerj233 james-fu caijjnuaa we1l1n nnu-gisa liuwenhaha henrywoodotc medical-images-process sunyong2016 pkucss lcf000000 captain1986 juliussimonelli stevenbot zwj6 khle08 349214897 tututoo moomoofarm1 vitalyvels jing--li mingyates robotninjya xiangnanliu coolcoolhua zeng8280 highdxy somous-jhzhao shuixianhua bobchengyang ting2-wang vladperervenko b160413 jayceelee gaimjkp sunmanli sjyttkl fengjianshe missingboyzfy git163 haorand chanocy jgonsior chaoso mobying dililala cs911 sodiqadewole gxwangupc geog0521 arcral workindead fychf yuanmengzhixing yashalshakti kangzi noticeable hongxin001 peace-zy dumpmemory wxybdth jlsbssmnn sirius222 zoubs nuhhatipoglu linuswangg yotofu vishu26 poppybrown ariapoy minghsuanwu hzccaicai

alipy's Issues

QueryRandom strategy didn't work.

Thanks for your work. The most comprehensive AL package I've seen.

Issue:

from sklearn.datasets import load_iris,load_breast_cancer
from alipy.experiment.al_experiment import AlExperiment

import warnings
warnings.filterwarnings('ignore')

import copy
from sklearn.datasets import make_classification
from alipy import ToolBox
from alipy.query_strategy.query_labels import QueryInstanceGraphDensity, QueryInstanceQBC, \
    QueryInstanceQUIRE, QueryRandom, QueryInstanceUncertainty, QureyExpectedErrorReduction, QueryInstanceLAL

X, y = make_classification(n_samples=500, n_features=20, n_informative=2, n_redundant=2,
    n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0,
    hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)

alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# The cost budget is 50 times querying
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)


def main_loop(alibox, strategy, round):
    """Run one active-learning fold with ``strategy`` and return its saver.

    Besides its arguments, this function uses the script-level globals
    ``model``, ``stopping_criterion``, ``X`` and ``y``.

    Args:
        alibox: Object providing ``get_split``, ``get_stateio``,
            ``calc_performance_metric`` and ``State``.
        strategy: Query strategy object; its ``select`` method is called as
            ``select(label_ind, unlab_ind, batch_size=1)``.
        round: Index of the fold passed to ``get_split`` and
            ``get_stateio``. Note that the name shadows the built-in
            ``round`` inside this function.

    Returns:
        The saver obtained from ``alibox.get_stateio(round)``, with one
        state added per query iteration.
    """
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    while not stopping_criterion.is_stop():
        # Select a subset of the unlabeled indexes according to the query strategy.
        # NOTE(review): no ``model`` argument is passed here; which model each
        # strategy falls back to is not visible in this snippet.
        select_ind = strategy.select(label_ind, unlab_ind, batch_size=1)
        # Move the queried indexes from the unlabeled set to the labeled set
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Update model and calc performance according to the model you are using
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                  y_pred=pred,
                                                  performance_metric='accuracy_score')

        # Record the queried indexes and the resulting performance in the saver
        st = alibox.State(select_index=select_ind, performance=accuracy)
        saver.add_state(st)

        # Passing the current progress to stopping criterion object
        stopping_criterion.update_information(saver)
    # Reset the progress in the shared stopping criterion object so that the
    # next call to main_loop starts from a clean state
    stopping_criterion.reset()
    return saver

unc_result = []
qbc_result = []
random_result = []

for round in range(5):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)

    # Use pre-defined strategy
    unc = QueryInstanceUncertainty(X, y)
    qbc = QueryInstanceQBC(X, y)
    rnd = QueryRandom(X,y)

    unc_result.append(copy.deepcopy(main_loop(alibox, unc, round)))
    qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round)))
    random_result.append(copy.deepcopy(main_loop(alibox, rnd, round)))

analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')

analyser.add_method(method_name='QBC', method_results=qbc_result)
analyser.add_method(method_name='Unc', method_results=unc_result)
analyser.add_method(method_name='RANDOM', method_results=random_result)

print(analyser)
analyser.plot_learning_curves(title='Example of alipy', std_area=False)

Error is below:

| round | initially labeled data | number of queries | cost | Performance: |
|   0   |   35 (10.00% of all)   |         50        |  0   | 0.846 ± 0.02 |
| round | initially labeled data | number of queries | cost | Performance: |
|   0   |   35 (10.00% of all)   |         50        |  0   | 0.841 ± 0.01 |
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-15-5d0ada1f815d> in <module>
     70     unc_result.append(copy.deepcopy(main_loop(alibox, unc, round)))
     71     qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round)))
---> 72     random_result.append(copy.deepcopy(main_loop(alibox, rnd, round)))
     73 
     74 analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')

<ipython-input-15-5d0ada1f815d> in main_loop(alibox, strategy, round)
     35         # Select a subset of Uind according to the query strategy
     36         # Passing model=None to use the default model for evaluating the committees' disagreement
---> 37         select_ind = strategy.select(label_ind, unlab_ind, batch_size=1)
     38         label_ind.update(select_ind)
     39         unlab_ind.difference_update(select_ind)

TypeError: select() got multiple values for argument 'batch_size'

Time module has no (more) method clock()

Hello,
I am using Python 3.8, and it seems that the time.clock() method used in the file alipy/utils/multi_thread.py (line 112, in __init__()) no longer exists.
I cannot execute the following code given in example:

from sklearn.datasets import load_iris
from alipy.experiment.al_experiment import AlExperiment

X, y = load_iris(return_X_y=True)
al = AlExperiment(X, y, stopping_criteria='num_of_queries', stopping_value=50,)
al.split_AL()
al.set_query_strategy(strategy="QueryInstanceUncertainty", measure='least_confident')
al.set_performance_metric('accuracy_score')
al.start_query(multi_thread=True)
al.plot_learning_curve()

(get the following error:

Traceback (most recent call last):
  File "/.../active_learning/main.py", line 46, in <module>
    main(sys.argv)
  File "/.../active_learning/main.py", line 37, in main
    al.start_query(multi_thread=True)
  File "/.../.local/lib/python3.8/site-packages/alipy/experiment/al_experiment.py", line 340, in start_query
    ace = aceThreading(self._X, self._y, self._train_idx, self._test_idx,
  File "/.../.local/lib/python3.8/site-packages/alipy/utils/multi_thread.py", line 112, in __init__
    self._start_time = time.clock()
AttributeError: module 'time' has no attribute 'clock'

[edit] But setting the multi_thread parameter to False avoids the issue. However, that means multi-threading cannot be used with recent Python versions...)

According to the doc of Python 3.5:

Deprecated since version 3.3: The behaviour of this function depends on the platform: use perf_counter() or process_time() instead, depending on your requirements, to have a well defined behaviour.

Is an update planned for compatibility with more recent Python versions?

Thank you and best regards!

Potential errors in ALiPy/examples/AL_settings/query_instance.py?

Thanks so much for the awesome library!
I was trying to run the "query_instance.py", the only changes I made to the code are:
i) Increased the num_of_queries from 50 to 500
ii) I set the number of rounds (probably "folds" is the word that you would like to use) to 1
I only tried QBC and random methods, and you will find the plot from the attached.

It is observable that QBC does not outperform random, both are performing alike on the simulated data. Any thoughts on it?

更换数据集之后，运行出问题

您好，非常感谢Alipy的伟大工作，这是我见过的最全面的主动学习包。
在我准备运行AL for multi-label data示例代码的时候出了问题。
用iris数据集是可以运行的，但是当我换了一个数据集就没法运行了。
我现在的代码是这样的

dataset_filepath = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'scene')

X, mult_y = import_libsvm_sparse(dataset_filepath, True).format_sklearn()
mult_y[mult_y == 0] = -1

# X, y = load_iris(return_X_y=True)
# mlb = OneHotEncoder()
# mult_y = mlb.fit_transform(y.reshape((-1, 1)))
# mult_y = np.asarray(mult_y.todense())
# mult_y[mult_y == 0] = -1

alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)


def main_loop(alibox, round, strategy):
    """Run one multi-label active-learning fold and return a copy of its saver.

    Besides its arguments, this function uses the script-level globals ``X``
    and ``mult_y``.

    Args:
        alibox: Object providing ``get_split``, ``get_stateio``,
            ``calc_performance_metric`` and ``State``.
        round: Index of the fold passed to ``get_split`` and
            ``get_stateio``. Note that the name shadows the built-in
            ``round`` inside this function.
        strategy: Query strategy object; its ``select`` method is called as
            ``select(label_ind, unlab_ind)``.

    Returns:
        A deep copy of the saver obtained from ``alibox.get_stateio(round)``,
        with one state added per query iteration.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # base model
    model = LabelRankingModel()

    # Keep querying while the labeled index collection holds at most 120 entries
    while len(label_ind) <= 120:
        # query and update
        select_labs = strategy.select(label_ind, unlab_ind)
        # use cost to record the amount of queried instance-label pairs:
        # a length-1 first entry is charged the number of label columns,
        # otherwise the cost is the number of selected entries
        # (presumably a length-1 entry denotes a whole instance — TODO confirm)
        if len(select_labs[0]) == 1:
            cost = mult_y.shape[1]
        else:
            cost = len(select_labs)
        label_ind.update(select_labs)
        unlab_ind.difference_update(select_labs)

        # train/test
        X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y)
        model.fit(X=X_tr, y=y_tr)
        pres, pred = model.predict(X[test_idx])
        perf = alibox.calc_performance_metric(y_true=mult_y[test_idx], y_pred=pred, performance_metric='hamming_loss')

        # save
        st = alibox.State(select_index=select_labs, performance=perf, cost=cost)
        saver.add_state(st)

    return copy.deepcopy(saver)


audi_result = []
quire_result = []
random_result = []
mmc_result = []
adaptive_result = []

for round in range(5):
    # init strategies
    audi = QueryMultiLabelAUDI(X, mult_y)
    quire = QueryMultiLabelQUIRE(X, mult_y)
    mmc = QueryMultiLabelMMC(X, mult_y)
    adaptive = QueryMultiLabelAdaptive(X, mult_y)
    random = QueryMultiLabelRandom()

    audi_result.append(main_loop(alibox, round, strategy=audi))
    quire_result.append(main_loop(alibox, round, strategy=quire))
    mmc_result.append(main_loop(alibox, round, strategy=mmc))
    adaptive_result.append(main_loop(alibox, round, strategy=adaptive))
    random_result.append(main_loop(alibox, round, strategy=random))

analyser = alibox.get_experiment_analyser(x_axis='cost')
analyser.add_method(method_name='AUDI', method_results=audi_result)
analyser.add_method(method_name='QUIRE', method_results=quire_result)
analyser.add_method(method_name='RANDOM', method_results=random_result)
analyser.add_method(method_name='MMC', method_results=mmc_result)
analyser.add_method(method_name='Adaptive', method_results=adaptive_result)
analyser.plot_learning_curves()

可以看到我的代码和示例代码的唯一区别就是我更改了数据源X和mult_y
X是一个2407*294维的ndarray, mult_y是一个2407*6的ndarray。
X是样本特征集合，mult_y是标签集合。
mult_y如下

array([[ 1, -1, -1, -1,  1, -1],                            
[ 1, -1, -1, -1, -1,  1],                            
[ 1, -1, -1, -1, -1, -1],                            
 ...,
[-1, -1, -1, -1, -1,  1],                            
[-1, -1, -1, -1, -1,  1],                            
[-1, -1, -1, -1, -1,  1]])

X如下

array([[0.646467  , 0.666435  , 0.685047  , ..., 0.247298  ,
0.0140249 , 0.0297093 ],
[0.770156  , 0.767255  , 0.761053  , ..., 0.137833  ,
0.0826722 ,0.0363203 ],
[0.793984  , 0.772096  , 0.76182   , ..., 0.0511252 ,
0.112506 ,0.0839236 ],
 ...,
[0.952281  , 0.944987  , 0.905556  , ..., 0.0319002 ,
0.0175471 ,0.0197344 ]])

我的数据格式是符合Alipy的规范的，为什么运行会报错：
ValueError: cannot reshape array of size 0 into shape (0, newaxis)

Problem with QueryInstanceCoresetGreedy and other strategies that take train_idx argument

This error can be easily replicated by running the example in examples/AL_settings/query_instance.py
If you change the batch size to 10, you'll get an error. Here, it's an IndexError (see below), in my own application I get an Assertion Error. Regardless of error type, it's always caused by the assert self.train_idx[ind] in unlabel_index line that several query strategies use (those that take train_idx as argument). It's strange because the select method of the QueryInstanceCoresetGreedy class has a batch_size argument with default 1, so it should take batches...

`---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
in
96 eer_result.append(copy.deepcopy(main_loop(alibox, eer, round)))
97 rnd_result.append(copy.deepcopy(main_loop(alibox, rnd, round)))
---> 98 cors_result.append(copy.deepcopy(main_loop(alibox, cors, round)))
99 dw_result.append(copy.deepcopy(main_loop(alibox, dw, round)))
100 quire_result.append(copy.deepcopy(main_loop(alibox, quire, round)))

in main_loop(alibox, strategy, round)
39 # Select a subset of Uind according to the query strategy
40 # Passing model=None to use the default model for evaluating the committees' disagreement
---> 41 select_ind = strategy.select(label_index=label_ind, unlabel_index=unlab_ind, batch_size=10, model=model)
42 label_ind.update(select_ind)
43 unlab_ind.difference_update(select_ind)

~/anaconda3/lib/python3.8/site-packages/alipy/query_strategy/query_labels.py in select(self, label_index, unlabel_index, batch_size, **kwargs)
1877 # should have min_distance of zero to a cluster center.
1878 assert ind not in self.already_selected
-> 1879 assert self.train_idx[ind] in unlabel_index
1880
1881 self.update_distances([ind], only_new=True, reset_dist=False)

IndexError: index 1250 is out of bounds for axis 0 with size 350`

Problem with ExpectedErrorReduction for instance selection

Hi,

I am running experiments for multi-class classification and get the following error for ExpectedErrorReduction:
File "/home/julia/master_thesis/env/lib/python3.6/site-packages/alipy/query_strategy/query_labels.py", line 829, in select
score.append(pv[i, yi] * self.log_loss(prob))
IndexError: index 4 is out of bounds for axis 1 with size 4

I think the Error is that my initial seed set (label index) does not contain all labels which can be found in y.
In the following code, shouldn't it be

classes = np.unique(label_y)
instead of
classes = np.unique(self.y)?

`` if self.X is None or self.y is None:
raise Exception('Data matrix is not provided.')
if model is None:
model = LogisticRegression(solver='liblinear')
model.fit(self.X[label_index if isinstance(label_index, (list, np.ndarray)) else label_index.index],
self.y[label_index if isinstance(label_index, (list, np.ndarray)) else label_index.index])

    unlabel_x = self.X[unlabel_index]
    label_y = self.y[label_index]
    ##################################

    classes = np.unique(self.y)
    pv, spv = _get_proba_pred(unlabel_x, model)
    scores = []
    for i in range(spv[0]):
        new_train_inds = np.append(label_index, unlabel_index[i])
        new_train_X = self.X[new_train_inds, :]
        unlabel_ind = list(unlabel_index)
        unlabel_ind.pop(i)
        new_unlabel_X = self.X[unlabel_ind, :]
        score = []
        for yi in classes:
            new_model = copy.deepcopy(model)
            new_model.fit(new_train_X, np.append(label_y, yi))
            prob = new_model.predict_proba(new_unlabel_X)
            score.append(pv[i, yi] * self.log_loss(prob))
        scores.append(np.sum(score))

    return unlabel_index[nsmallestarg(scores, batch_size)]``

Using alipy with Tensorflow

Hello, alipy is unable to read the model I created using Tensorflow. I tried both creating a model and then reading it using AlExperiment and defining the model directly in the dialogs, both to no avail. Could you provide an example on how to rectify this/how to use Tensorflow models to be used for AL in your function please ? Any feedback would be greatly appreciated. Thanks !!!

The AL algorithms didn't work

Thanks so much for your great work.

It seems that the AL algorithms do not achieve a better score: compared to random sampling, Uncertainty and QBC show no better accuracy. The dataset is sklearn's load_breast_cancer.

Besides, the random query curve has one more point under the same stopping criterion.

Looking forward to your reply.

Code is shown below :

# coding: utf-8

from sklearn.datasets import load_breast_cancer
# Ploting
from matplotlib.pylab import plt
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")
from sklearn.ensemble import RandomForestClassifier

import xgboost
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_iris,load_breast_cancer
from alipy.experiment.al_experiment import AlExperiment

import copy
from sklearn.datasets import make_classification
from alipy import ToolBox
from alipy.query_strategy.query_labels import QueryInstanceGraphDensity, QueryInstanceQBC,     QueryInstanceQUIRE, QueryRandom, QueryInstanceUncertainty, QureyExpectedErrorReduction, QueryInstanceLAL

dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.2, split_count=5)


model = RandomForestClassifier(n_estimators=100, max_depth=5,random_state=0)


stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 8)
def main_loop(alibox, strategy, round):
    """Run one active-learning fold with ``strategy`` and return its saver.

    Besides its arguments, this function uses the script-level globals
    ``model``, ``stopping_criterion``, ``X`` and ``y``.

    Args:
        alibox: Object providing ``get_split``, ``get_stateio``,
            ``calc_performance_metric`` and ``State``.
        strategy: Query strategy object; its ``select`` method is called as
            ``select(label_ind, unlab_ind, batch_size=50)``.
        round: Index of the fold passed to ``get_split`` and
            ``get_stateio``. Note that the name shadows the built-in
            ``round`` inside this function.

    Returns:
        The saver obtained from ``alibox.get_stateio(round)``, with one
        state added per query iteration.
    """
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    while not stopping_criterion.is_stop():
        # Select a batch of 50 unlabeled indexes according to the query strategy.
        # NOTE(review): no ``model`` argument is passed here; which model each
        # strategy falls back to is not visible in this snippet.
        select_ind = strategy.select(label_ind, unlab_ind,batch_size=50)
#         print(len(select_ind))
        # Move the queried indexes from the unlabeled set to the labeled set
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Update model and calc performance according to the model you are using.
        # NOTE: despite its name, ``accuracy`` holds the 'roc_auc_score' metric.
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                  y_pred=pred,
                                                  performance_metric='roc_auc_score')

        # Record the queried indexes and the resulting performance in the saver
        st = alibox.State(select_index=select_ind, performance=accuracy)
        saver.add_state(st)

        # Passing the current progress to stopping criterion object
        stopping_criterion.update_information(saver)
    # Reset the progress in the shared stopping criterion object so that the
    # next call to main_loop starts from a clean state
    stopping_criterion.reset()
    return saver

unc_result = []
qbc_result = []
eer_result = []
quire_result = []
density_result = []


for round in range(3):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)

    # Use pre-defined strategy
    unc = QueryInstanceUncertainty(X, y)
    qbc = QueryInstanceQBC(X, y)

    unc_result.append(copy.deepcopy(main_loop(alibox, unc, round)))
    qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round)))


random = QueryRandom(X, y)
random_result = []

for round in range(3):
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # calc the initial point
    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
    pred = model.predict(X[test_idx, :])
    accuracy = sum(pred == y[test_idx]) / len(test_idx)
    saver.set_initial_point(accuracy)

    while not stopping_criterion.is_stop():
        # Select a subset of Uind according to the query strategy
        # Passing model=None to use the default model for evaluating the committees' disagreement
        select_ind = random.select(unlab_ind, batch_size=50)
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Update model and calc performance according to the model you are using
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                y_pred=pred,
                                                performance_metric='roc_auc_score')

        # Save intermediate results to file
        st = alibox.State(select_index=select_ind, performance=accuracy)
        saver.add_state(st)
        saver.save()

        # Passing the current progress to stopping criterion object
        stopping_criterion.update_information(saver)
    # Reset the progress in stopping criterion object
    stopping_criterion.reset()
    random_result.append(copy.deepcopy(saver))
    
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
analyser.add_method(method_name='Unc', method_results=unc_result)
analyser.add_method(method_name='Rnd', method_results=random_result)
analyser.add_method(method_name='QBC', method_results=qbc_result)

print(analyser)
analyser.plot_learning_curves(title='Example of alipy', std_area=False)

A bug in experiment/al_experiment.py

In line 149, the array should contain QueryInstanceRandom rather than QueryRandom, because line 163 checks whether the strategy is QueryInstanceRandom.

Error in website tutorial?

In the website tutorial at http://parnec.nuaa.edu.cn/huangsj/alipy/10_mins_to_alipy.html
there is no initial fit of the model and so I get an error when running the line:

select_ind = uncertainStrategy.select(label_ind, unlab_ind, model=model, batch_size=1)
NotFittedError: Call fit before prediction

It looks to me like you need to run

model.fit(X=X[label_ind.index, :], y=y[label_ind.index])

after the alibox.get_split line.

Also, round is a built-in function in Python, so the variable should be renamed to something else, e.g. fold.

regression

Hello, how can I use it for regression tasks?
When I give the toolbox regression (continuous-valued) labels, it always produces the following error:
ValueError: Unknown label type: (array([339.731, 340.059, 340.404, ..., 421.542, 423.743, 423.307]),)

time.clock() has been removed from py38

“The function time.clock() has been removed, after having been deprecated since Python 3.3: use time.perf_counter() or time.process_time() instead, depending on your requirements, to have well-defined behavior. (Contributed by Matthias Bussonnier in bpo-36895.)”

中文官网

希望能够支持中文官网和README

Possible bug in get_query_strategy/query_labels.py

In class "QueryInstanceQUIRE" there is a method named "select" that takes a batch_size argument, but I found that the argument has no real effect on the result. If batch_size is less than the number of unlabeled items, the method always returns exactly one index.

LSTM

how to add zip file of LSTM to matlab?

二分类index error问题

您好！我在进行二分类，可以确定输入的x和y都是没有问题的，请问为什么会出现index error呢？还有就是这个41712是在哪里出现的呢？

会考虑支持更多的多标签主动学习算法吗

首先感谢Alipy开发者的工作给我们这些使用者减轻了很多的负担，但我目前发现Alipy在多标签主动学习方面的算法都不是很新。
Alipy目前支持QUIRE(2014), AUDI(2013), MMC(2009), ADAPTIVE(2013)等算法。
会考虑支持更多优秀的多标签主动学习算法吗？比如2DAL等等。
还是因为没有实现的那些算法在性能上和这四个相比没有足够的提升，所以暂时不需要实现它们？

Possible Bug in get_query_strategy/query_labels.py Line 1279-1285

When I pass QueryInstanceSPAL as the "strategy" parameter in "al.set_query_strategy(strategy)", the line "al.start_query(multi_thread=True)" fails, because in multi_thread mode pickle.dump throws an error, namely "TypeError: can't pickle module objects".
So I suggest placing "import cvxpy" outside the class to avoid that.
"cvxpy" can be treated as a basic dependency.

QueryInstanceBMDR

After I adopted the query strategy QueryInstanceBMDR, the following error was raised: (ImportError: This method need cvxpy to solve the QP problem.Please refer to https://www.cvxpy.org/install/index.html install cvxpy manually before using.)
So I installed cvxpy on my computer with pip, but the same error is still raised.
Would you mind taking a little time to help me fix it?

input shape issue with LSTM model

I want to use the promising alipy library to perform active learning on a timeseries classifier model built with keras and tensorflow. The model is a LSTM, which leads to a 3D input shape.
Is there any way to use the alipy library then?
Up to now I changed your 10-minute tutorial code so that it uses my own model. However, the code stops at the very beginning, when I initialize the ToolBox, because of the input shape of the X data.

How to use multi-thread in multi-label setting

Hello, everyone, I am new to ALiPy, and I am lucky to have found such a good project.
I haven't found an example of using multi-threading in the multi-label setting, so I wrote one based on aceThreading_usage.py

from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from alipy import ToolBox
from alipy.data_manipulate import StandardScale
from alipy.query_strategy.multi_label import QueryMultiLabelMMC, LabelRankingModel
from alipy.index import get_Xy_in_multilabel
from alipy.experiment import State

# load data
X, y = load_iris(return_X_y=True)
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
mult_y = np.asarray(mult_y.todense())
mult_y[mult_y == 0] = -1

# init alibox
alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, all_class=False)


def target_func(round, train_id, test_id, Lcollection, Ucollection, saver, examples, labels, global_parameters):
    """Run one multi-label active-learning fold using QueryMultiLabelMMC.

    This function is passed to ``alibox.get_ace_threading`` as the
    ``target_function`` later in the script.

    Args:
        round: Not used in this body.
        train_id: Not used in this body.
        test_id: Index used to select the test rows of ``examples`` and
            ``labels``.
        Lcollection: Labeled index collection; updated with each selection.
        Ucollection: Unlabeled index collection; each selection is removed
            from it. The loop runs while it holds more than 30 entries.
        saver: Object that receives one ``State`` per iteration and is saved
            after each one.
        examples: Feature matrix passed to the query strategy and the model.
        labels: Label matrix passed to the query strategy and the model.
        global_parameters: Not used in this body.

    Returns:
        None. Results are recorded through ``saver``.
    """
    qs = QueryMultiLabelMMC(examples, labels)
    model = LabelRankingModel()

    while len(Ucollection) > 30:
        # Query, then move the selection from the unlabeled to the labeled set
        select_index = qs.select(Lcollection, Ucollection)
        Ucollection.difference_update(select_index)
        Lcollection.update(select_index)

        # update model
        X_tr, y_tr, _ = get_Xy_in_multilabel(Lcollection, X=examples, y=labels, unknown_element=0)
        model.fit(X=X_tr, y=y_tr)
        _, pred = model.predict(examples[test_id, :])

        # calculate micro-F1
        # NOTE(review): the script recodes 0 to -1 in the label matrix before
        # this function is used; the bitwise `&` and the sums below look like
        # they assume a 0/1 integer encoding — TODO confirm.
        Z = pred
        Y = labels[test_id]
        # max(1, ...) guards against division by zero when a sum is 0
        precision = np.sum(Z & Y) / max(1, np.sum(Z))
        recall = np.sum(Z & Y) / max(1, np.sum(Y))
        micro_f1 = 0 if precision == 0 and recall == 0 else \
                (2 * precision * recall) / (precision + recall)

        # save intermediate results
        st = State(select_index=select_index, performance=micro_f1)
        saver.add_state(st)
        saver.save()


# init acethread
acethread = alibox.get_ace_threading(target_function=target_func)
acethread.start_all_threads()

# get the result,return list of stateIO
stateIO_list = acethread.get_results()

# save the state of multi_thread to the saving_path in pkl form
acethread.save()

I always receive an error:
Exception: Label_size can not be induced from fully labeled set, label_size must be provided
in the line
acethread = alibox.get_ace_threading(target_function=target_func)
Can anyone help me? I will be very grateful!

The predict_proba method does not exist in svm.SVC of sklearn in latest version 0.21.2

Hi @NUAA-AL @tangypnuaa

At first, thank you for your great work in Active Learning.

Today, when I used sklearn.svm.SVC in my active learning program based on the uncertainty sampling strategy of ALiPy, I got this error: "Exception: model object must implement predict_proba methods in current algorithm." I searched the sklearn documentation for SVC and did find the predict_proba method in the latest version 0.21.2. So why does this error occur?

The axis of plot_learning_curves is not integer, which is awkward.

The axis of plot_learning_curves is not integer, which is awkward. What can I do to revise it ?

Is alipy compatible with sequence tagging?

I did the example with the iris dataset, so I did some classification. I was wondering if it was also possible to use a NER corpus with alipy? Thank you!

ImportError: cannot import name 'ToolBox'

我已经成功安装了alipy，用这个命令：pip3 install alipy.
但是当我运行from alipy import ToolBox，我收到了这个错误：ImportError: cannot import name 'ToolBox'
有人可以帮助我吗？我将无比感激。

Can alipy be used for multi-classification problems? Is there any sample code to look at?

Manipulating the training and test sets

Hi, I have a slight data manipulation problem. Instead of splitting the entire data set automatically with split_AL, I want to use a pre-defined training set with instances drawn from a pre-defined test set for queries. How might I go about doing this using alipy ?

Please advise, thanks !!!

Question regarding query selection

Hi,

I already run a few experiments for different text classification datasets with class imbalance and in many cases, Random sampling of indexes usually performs better than most (sometimes all) other sampling methods. Does that mean AL fails for text classification?

Problems with select_by_prediction_mat function of class QueryInstanceUncertainty

Hi,

I try to do instance selection with the select_by_prediction_mat function. My predict 2D-Array looks like this:

[['1' '0.5258755683898926']
['2' '0.5193215012550354']
['1' '0.557829737663269']
...
['1' '0.550930380821228']
['1' '0.5237742066383362']
['1' '0.5445562601089478']]

If I use this as my predict input for the function with entropy as measure, I get the following error:
Traceback (most recent call last):
File "test_flair.py", line 174, in
select_ind = Uncertaint_Strategy.select_by_prediction_mat(unlab_ind, pred_ma, batch_size=5)
File "/home/julia/master_thesis/env/lib/python3.6/site-packages/alipy/query_strategy/query_labels.py", line 215, in select_by_prediction_mat
pv[pv <= 0] = 1e-06 # avoid zero division
TypeError: '<=' not supported between instances of 'numpy.ndarray' and 'int'

Could you tell me in detail what the prediction matrix should look like? I know the documentation says [n_samples, n_classes], but I have trouble understanding exactly what that means. Could you tell me how I should transform my prediction matrix so that it works?

thank you in advance

Adaptive算法实现有问题？

嗨，您好，我最近在研究AliPy的代码。(multilabel.py 1037-1045行)

ALiPy/alipy/query_strategy/multi_label.py

Line 1038 in ab6c1e3

pos[br_real < 0] = 1

这里的Adaptive算法实现似乎有点小问题。
设想这么一种情况，br_real矩阵每个元素都是2，那么pos矩阵的结果可能为负数。
参考原论文

可知，pos不可能存在负数。
这里忽略了br_real元素都大于0的情况。同样，下面在计算neg的时候也有同样的问题。

Possible bug in get_query_strategy inside toolbox.py

ALiPy/alipy/toolbox.py

Lines 402 to 409 in 71d716d

 if kwargs.pop('train_idx', None) is None: 

 raise ValueError( 

 "Missing necessary parameter 'train_idx' in GraphDensity or QUIRE method.") 

 if strategy_name == 'QueryInstanceGraphDensity': 

 query_function = QueryInstanceGraphDensity(self._X, 

 self._y, 

 train_idx=kwargs.pop( 

 'train_idx'))

When I tried to run 'QueryInstanceGraphDensity', it gave me an error: "Missing key 'train_idx'", even though I was providing this key.

One possible bug could be that in line 402, kwargs.pop('train_idx', None) is being called which removes the train_idx key from the dictionary. Thus when it is called again in line 408, it gives an error as the key is no longer present due to the previous 'pop' statement.

Do let me know if I am missing out anything.

scaling of y changed after ToolBox.get_query_strategy(strategy_name='QueryInstanceBMDR', kernel='rbf')

When loading both strategies that use cvxpy, the values of ToolBox._y change from (0, 1) to (-1, 1). Since the ToolBox object does not create a copy but instead only stores a reference to y, the actual values in y are changed as well. Other strategies in query_labels.py do not display such behavior.

This to me seems problematic, as the original data should not be changed. It is also a problem because if you run several models from the same ToolBox object, that object in general should not change between models.

See screenshot for quick demonstration, the data is split in X and y, both np.arrays. Y is either 0 or 1 (see printouts).

I use colab, so the alipy version is always freshly installed.

运行报错：ZeroDivisionError: division by zero

您好，我使用alipy在scene数据集(多标签数据集)上比较MMC和Random的性能。但是不知为何，总是报错ZeroDivisionError: division by zero。我把alipy的文档再三看过了，我认为我的代码是没有问题的。

import numpy as np
import copy
from alipy import ToolBox
from alipy.index import get_Xy_in_multilabel
from alipy.query_strategy.multi_label import QueryMultiLabelAUDI, QueryMultiLabelMMC, \
                        QueryMultiLabelAdaptive, QueryMultiLabelRandom, LabelRankingModel

import arff
# Parse the scene ARFF file; the context manager closes the handle once the
# content has been parsed instead of leaking it.
with open('scene.arff', 'r') as arff_file:
    dataset = arff.load(arff_file)
data = np.array(dataset['data'])
# The first 294 columns are used as features, the remaining ones as labels.
X = data[:, :294].astype('float64')
# Cast to float BEFORE remapping 0 -> -1. If the label columns were loaded as
# strings (NOTE(review): liac-arff appears to decode nominal attributes as
# strings -- confirm), comparing them with the integer 0 matches nothing and
# the labels would silently stay in {0, 1}. For numeric data the result is
# the same as remapping first.
mult_y = data[:, 294:].astype('float64')
mult_y[mult_y == 0] = -1

alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)


def main_loop(alibox, round, strategy):
    """Run one fold of the active-learning experiment with a query strategy.

    Reads the module-level ``X`` and ``mult_y`` arrays for training,
    prediction and cost computation.

    Parameters
    ----------
    alibox : ToolBox
        Provides the data split, the state saver, the performance metric
        and the ``State`` constructor used below.
    round : int
        Index of the fold passed to ``alibox.get_split`` and
        ``alibox.get_stateio``.
    strategy : object
        Query strategy exposing ``select(label_ind, unlab_ind)``.

    Returns
    -------
    A deep copy of the state saver holding one state per query iteration.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Intermediate results saver for this fold.
    saver = alibox.get_stateio(round)
    # Base model retrained after every query.
    model = LabelRankingModel()

    # A simple stopping criterion to specify the query budget.
    while len(label_ind) <= 1500:
        # Query the next selection (no debugger breakpoint here, so the loop
        # runs unattended).
        select_labs = strategy.select(label_ind, unlab_ind)
        # Use cost to record the amount of queried instance-label pairs:
        # when the first selected index has a single element, charge one
        # unit per label column; otherwise charge one unit per selected index.
        if len(select_labs[0]) == 1:
            cost = mult_y.shape[1]
        else:
            cost = len(select_labs)
        label_ind.update(select_labs)
        unlab_ind.difference_update(select_labs)

        # Train on the currently labeled entries, then evaluate on the test set.
        X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y, unknown_element=0)
        model.fit(X=X_tr, y=y_tr)
        pres, pred = model.predict(X[test_idx])
        perf = alibox.calc_performance_metric(y_true=mult_y[test_idx], y_pred=pred, performance_metric='hamming_loss')

        # Record this iteration's selection, performance and cost.
        st = alibox.State(select_index=select_labs, performance=perf, cost=cost)
        saver.add_state(st)

    return copy.deepcopy(saver)


# Per-strategy collections of fold results; each entry is the saver returned
# by main_loop for one fold. Only the MMC and Random lists are filled below,
# because the AUDI and Adaptive runs are commented out.
audi_result = []
random_result = []
mmc_result = []
adaptive_result = []

# Run the experiment for folds 0..4.
for round in range(5):
    # Create fresh strategy objects for every fold.
    # audi = QueryMultiLabelAUDI(X, mult_y)
    mmc = QueryMultiLabelMMC(X, mult_y)
    # adaptive = QueryMultiLabelAdaptive(X, mult_y)
    random = QueryMultiLabelRandom()

    # Within a fold, MMC is run first and Random second.
    # audi_result.append(main_loop(alibox, round, strategy=audi))
    mmc_result.append(main_loop(alibox, round, strategy=mmc))
    # adaptive_result.append(main_loop(alibox, round, strategy=adaptive))
    random_result.append(main_loop(alibox, round, strategy=random))

# Compare the strategies, using the recorded cost as the x axis, and plot
# their learning curves.
analyser = alibox.get_experiment_analyser(x_axis='cost')
# analyser.add_method(method_name='AUDI', method_results=audi_result)
analyser.add_method(method_name='RANDOM', method_results=random_result)
analyser.add_method(method_name='MMC', method_results=mmc_result)
# analyser.add_method(method_name='Adaptive', method_results=adaptive_result)
analyser.plot_learning_curves()

Debug显示问题就出在

# train/test
X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y, unknown_element=0)
model.fit(X=X_tr, y=y_tr)
pres, pred = model.predict(X[test_idx])
perf = alibox.calc_performance_metric(y_true=mult_y[test_idx], y_pred=pred, performance_metric='hamming_loss')

里面的

pres, pred = model.predict(X[test_idx])

这一句。
我对于LabelRank算法实在不熟悉，只是大致知道这个是通过对标签排序来预测未标记数据的一个方法。我认为问题出在LabelRank算法的实现上，因为我在debug的过程中没有发现任何问题。
scene数据集下载地址
我是通过liac-arff读取的arff文件，我确定数据读取没有问题。
libac-arff安装方式为：pip install liac-arff
运行我上面的代码就可以重现这个问题。
请问您可以帮帮我吗？这个问题我已经花了很久时间但没有解决。

can alipy be used in pytorch?

Hi, can I use alipy in a CNN or LSTM model in Pytorch?

	if kwargs.pop('train_idx', None) is None:
	raise ValueError(
	"Missing necessary parameter 'train_idx' in GraphDensity or QUIRE method.")
	if strategy_name == 'QueryInstanceGraphDensity':
	query_function = QueryInstanceGraphDensity(self._X,
	self._y,
	train_idx=kwargs.pop(
	'train_idx'))