Module pulearn.elkanoto
Both PU classification methods from the Elkan & Noto paper.
Expand source code Browse git
"""Both PU classification methods from the Elkan & Noto paper."""
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
class ElkanotoPuClassifier(BaseEstimator, ClassifierMixin):
    """Positive-unlabeled classifier using the unweighted Elkan & Noto method.

    Parameters
    ----------
    estimator : sklearn.BaseEstimator
        Any sklearn-compliant estimator object implementing the fit() and
        predict_proba() methods.
    hold_out_ratio : float, default 0.1
        The ratio of training examples to set aside to estimate the
        probability of an example to be positive.
    """

    def __init__(self, estimator, hold_out_ratio=0.1):
        self.estimator = estimator
        # c is the constant probability that a labeled example is positive,
        # i.e. an estimate of p(s=1|y=1,x); initialized to 1.
        self.c = 1.0
        self.hold_out_ratio = hold_out_ratio
        self.estimator_fitted = False

    def __str__(self):
        return 'Estimator: {}\np(s=1|y=1,x) ~= {}\nFitted: {}'.format(
            self.estimator,
            self.c,
            self.estimator_fitted,
        )

    def fit(self, X, y):
        """Fits the classifier.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples.
        y : array-like, shape = [n_samples]
            The target values. An array of int.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If there are not enough positive examples to build a hold-out
            set of the requested ratio.
        """
        positives = np.where(y == 1.0)[0]
        hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio))
        # check for the required number of positive examples
        if len(positives) <= hold_out_size:
            raise ValueError(
                'Not enough positive examples to estimate p(s=1|y=1,x).'
                ' Need at least {}.'.format(hold_out_size + 1)
            )
        # construct the hold-out set from a random subset of the positives
        np.random.shuffle(positives)
        hold_out = positives[:hold_out_size]
        X_hold_out = X[hold_out]
        X = np.delete(X, hold_out, 0)
        y = np.delete(y, hold_out)
        # fit the inner estimator on the remaining examples
        self.estimator.fit(X, y)
        hold_out_predictions = self.estimator.predict_proba(X_hold_out)
        # keep only the probability of the positive class
        hold_out_predictions = hold_out_predictions[:, 1]
        # update c, the estimate of p(s=1|y=1,x), as the mean predicted
        # probability over the held-out positives (Elkan & Noto, estimator e1)
        self.c = np.mean(hold_out_predictions)
        self.estimator_fitted = True
        # BUGFIX: return self as documented and as required by the sklearn
        # estimator contract (enables fit(...).predict(...) chaining).
        return self

    def predict_proba(self, X):
        """Predict class probabilities for X.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        p : array of shape = [n_samples]
            The positive-class probability of each input sample, corrected
            by dividing the inner estimator's p(s=1|x) by c ~= p(s=1|y=1).

        Raises
        ------
        NotFittedError
            If fit() was not called before this method.
        """
        if not self.estimator_fitted:
            raise NotFittedError(
                'The estimator must be fitted before calling predict_proba().'
            )
        probabilistic_predictions = self.estimator.predict_proba(X)
        # keep only the positive-class column before applying the correction
        probabilistic_predictions = probabilistic_predictions[:, 1]
        return probabilistic_predictions / self.c

    def predict(self, X, threshold=0.5):
        """Predict labels.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.
        threshold : float, default 0.5
            The decision threshold over probability to warrant a
            positive label.

        Returns
        -------
        y : array of int of shape = [n_samples]
            Predicted labels (1.0 or -1.0) for the given input samples.

        Raises
        ------
        NotFittedError
            If fit() was not called before this method.
        """
        if not self.estimator_fitted:
            raise NotFittedError(
                'The estimator must be fitted before calling predict(...).'
            )
        return np.array([
            1.0 if p > threshold else -1.0
            for p in self.predict_proba(X)
        ])
class WeightedElkanotoPuClassifier(BaseEstimator, ClassifierMixin):
    """Positive-unlabeled classifier using the weighted Elkan & Noto method.

    See the original paper for details on how the `labeled` and `unlabeled`
    quantities are used to weigh training examples and affect the learning
    process: https://cseweb.ucsd.edu/~elkan/posonly.pdf

    Parameters
    ----------
    estimator : sklearn.BaseEstimator
        Any sklearn-compliant estimator object implementing the fit() and
        predict_proba() methods.
    labeled : int
        The cardinality to attribute to the labeled training set.
    unlabeled : int
        The cardinality to attribute to the unlabeled training set.
    hold_out_ratio : float, default 0.1
        The ratio of training examples to set aside to estimate the
        probability of an example to be positive.
    """

    def __init__(self, estimator, labeled, unlabeled, hold_out_ratio=0.1):
        self.estimator = estimator
        # c is the constant probability that a labeled example is positive,
        # i.e. an estimate of p(s=1|y=1,x); initialized to 1.
        self.c = 1.0
        self.hold_out_ratio = hold_out_ratio
        self.labeled = labeled
        self.unlabeled = unlabeled
        self.estimator_fitted = False

    def __str__(self):
        return 'Estimator: {}\np(s=1|y=1,x) ~= {}\nFitted: {}'.format(
            self.estimator,
            self.c,
            self.estimator_fitted,
        )

    def fit(self, X, y):
        """Fits the classifier.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples.
        y : array-like, shape = [n_samples]
            The target values. An array of int.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If there are not enough positive examples to build a hold-out
            set of the requested ratio.
        """
        positives = np.where(y == 1.0)[0]
        hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio))
        # check for the required number of positive examples
        if len(positives) <= hold_out_size:
            raise ValueError(
                'Not enough positive examples to estimate p(s=1|y=1,x).'
                ' Need at least {}.'.format(hold_out_size + 1)
            )
        # construct the hold-out set from a random subset of the positives
        np.random.shuffle(positives)
        hold_out = positives[:hold_out_size]
        X_hold_out = X[hold_out]
        X = np.delete(X, hold_out, 0)
        y = np.delete(y, hold_out)
        # fit the inner estimator on the remaining examples
        self.estimator.fit(X, y)
        hold_out_predictions = self.estimator.predict_proba(X_hold_out)
        # keep only the probability of the positive class
        hold_out_predictions = hold_out_predictions[:, 1]
        # update c, the estimate of p(s=1|y=1,x), as the mean predicted
        # probability over the held-out positives (Elkan & Noto, estimator e1)
        self.c = np.mean(hold_out_predictions)
        self.estimator_fitted = True
        # BUGFIX: return self as documented and as required by the sklearn
        # estimator contract (enables fit(...).predict(...) chaining).
        return self

    def _estimateEy(self, G):
        """Returns E[y], which is P(y=1), estimated from probabilities G.

        G is the [n_samples, n_classes] output of predict_proba(); only the
        positive-class column is used. See section 3 of the Elkan & Noto
        paper for the weighting scheme.
        """
        n = self.labeled
        m = self.labeled + self.unlabeled
        # BUGFIX: work on a copy so that clamping below does not mutate the
        # caller's probability array through the view returned by G[:, 1]
        G = G[:, 1].copy()
        # clamp probabilities of exactly 1.0 to avoid division by zero in W
        np.place(G, G == 1.0, 0.999)
        W = (G / (1 - G)) * ((1 - self.c) / self.c)
        return (float(n) + float(W.sum())) / float(m)

    def predict_proba(self, X):
        """Predict class probabilities for X.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        p : array of shape = [n_samples]
            The weighted positive-class probability of each input sample.

        Raises
        ------
        NotFittedError
            If fit() was not called before this method.
        """
        if not self.estimator_fitted:
            raise NotFittedError(
                'The estimator must be fitted before calling predict_proba().'
            )
        n = self.labeled
        m = self.labeled + self.unlabeled
        # self.estimator.predict_proba gives the probability of P(s=1|x)
        # for x belongs to P or U
        probabilistic_predictions = self.estimator.predict_proba(X)
        yEstimate = self._estimateEy(probabilistic_predictions)
        # keep only the positive-class column before applying the weighting
        probabilistic_predictions = probabilistic_predictions[:, 1]
        numerator = probabilistic_predictions * (self.c * yEstimate * m)
        return numerator / float(n)

    def predict(self, X, treshold=0.5):
        """Predict labels.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.
        treshold : float, default 0.5
            The decision threshold over probability to warrant a
            positive label. (Parameter name kept misspelled for backward
            compatibility with existing keyword callers.)

        Returns
        -------
        y : array of int of shape = [n_samples]
            Predicted labels (1.0 or -1.0) for the given input samples.

        Raises
        ------
        NotFittedError
            If fit() was not called before this method.
        """
        if not self.estimator_fitted:
            raise NotFittedError(
                'The estimator must be fitted before calling predict().'
            )
        return np.array([
            1.0 if p > treshold else -1.0
            for p in self.predict_proba(X)
        ])
Classes
class ElkanotoPuClassifier (estimator, hold_out_ratio=0.1)
-
Positive-unlabeled classifier using the unweighted Elkan & Noto method.
Parameters
estimator
:sklearn.BaseEstimator
- Any sklearn-compliant estimator object implementing the fit() and predict_proba() methods.
hold_out_ratio
:float
, default0.1
The ratio of training examples to set aside to estimate the probability of an example to be positive.
Expand source code Browse git
class ElkanotoPuClassifier(BaseEstimator, ClassifierMixin): """Positive-unlabeled classifier using the unweighted Elkan & Noto method. Parameters ---------- estimator : sklearn.BaseEstimator Any sklearn-compliant estimator object implementing the fit() and predict_proba() methods. hold_out_ratio : float, default 0.1 The ratio of training examples to set aside to estimate the probability of an exmaple to be positive. """ def __init__(self, estimator, hold_out_ratio=0.1): self.estimator = estimator # c is the constant proba that a example is positive, init to 1 self.c = 1.0 self.hold_out_ratio = hold_out_ratio self.estimator_fitted = False def __str__(self): return 'Estimator: {}\np(s=1|y=1,x) ~= {}\nFitted: {}'.format( self.estimator, self.c, self.estimator_fitted, ) def fit(self, X, y): """Fits the classifier Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values. An array of int. Returns ------- self : object Returns self. """ positives = np.where(y == 1.0)[0] hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio)) # check for the required number of positive examples if len(positives) <= hold_out_size: raise ValueError( 'Not enough positive examples to estimate p(s=1|y=1,x).' ' Need at least {}.'.format(hold_out_size + 1) ) # construct the holdout set np.random.shuffle(positives) hold_out = positives[:hold_out_size] X_hold_out = X[hold_out] X = np.delete(X, hold_out, 0) y = np.delete(y, hold_out) # fit the inner estimator self.estimator.fit(X, y) hold_out_predictions = self.estimator.predict_proba(X_hold_out) hold_out_predictions = hold_out_predictions[:, 1] # try: # hold_out_predictions = hold_out_predictions[:, 1] # except TypeError: # pass # update c, the positive proba estimate c = np.mean(hold_out_predictions) self.c = c self.estimator_fitted = True def predict_proba(self, X): """Predict class probabilities for X. 
Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- p : array of shape = [n_samples, n_classes] The class probabilities of the input samples. The order of the classes corresponds to that in the attribute classes_. """ if not self.estimator_fitted: raise NotFittedError( 'The estimator must be fitted before calling predict_proba().' ) probabilistic_predictions = self.estimator.predict_proba(X) probabilistic_predictions = probabilistic_predictions[:, 1] return probabilistic_predictions / self.c def predict(self, X, threshold=0.5): """Predict labels. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. threshold : float, default 0.5 The decision threshold over probability to warrent a positive label. Returns ------- y : array of int of shape = [n_samples] Predicted labels for the given inpurt samples. """ if not self.estimator_fitted: raise NotFittedError( 'The estimator must be fitted before calling predict(...).' ) return np.array([ 1.0 if p > threshold else -1.0 for p in self.predict_proba(X) ])
Ancestors
- sklearn.base.BaseEstimator
- sklearn.base.ClassifierMixin
Methods
def fit(self, X, y)
-
Fits the classifier
Parameters
X
:array
-like
,shape
= [n_samples
,n_features
]- The training input samples.
y
:array
-like
,shape
= [n_samples
]- The target values. An array of int.
Returns
self
:object
- Returns self.
Expand source code Browse git
def fit(self, X, y): """Fits the classifier Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values. An array of int. Returns ------- self : object Returns self. """ positives = np.where(y == 1.0)[0] hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio)) # check for the required number of positive examples if len(positives) <= hold_out_size: raise ValueError( 'Not enough positive examples to estimate p(s=1|y=1,x).' ' Need at least {}.'.format(hold_out_size + 1) ) # construct the holdout set np.random.shuffle(positives) hold_out = positives[:hold_out_size] X_hold_out = X[hold_out] X = np.delete(X, hold_out, 0) y = np.delete(y, hold_out) # fit the inner estimator self.estimator.fit(X, y) hold_out_predictions = self.estimator.predict_proba(X_hold_out) hold_out_predictions = hold_out_predictions[:, 1] # try: # hold_out_predictions = hold_out_predictions[:, 1] # except TypeError: # pass # update c, the positive proba estimate c = np.mean(hold_out_predictions) self.c = c self.estimator_fitted = True
def predict(self, X, threshold=0.5)
-
Predict labels.
Parameters
X
:array
-like
ofshape
= [n_samples
,n_features
]- The input samples.
threshold
:float
, default0.5
- The decision threshold over probability to warrant a positive label.
Returns
y
:array
ofint
ofshape
= [n_samples
]- Predicted labels for the given input samples.
Expand source code Browse git
def predict(self, X, threshold=0.5): """Predict labels. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. threshold : float, default 0.5 The decision threshold over probability to warrent a positive label. Returns ------- y : array of int of shape = [n_samples] Predicted labels for the given inpurt samples. """ if not self.estimator_fitted: raise NotFittedError( 'The estimator must be fitted before calling predict(...).' ) return np.array([ 1.0 if p > threshold else -1.0 for p in self.predict_proba(X) ])
def predict_proba(self, X)
-
Predict class probabilities for X.
Parameters
X
:array
-like
ofshape
= [n_samples
,n_features
]- The input samples.
Returns
p
:array
ofshape
= [n_samples
,n_classes
]- The class probabilities of the input samples. The order of the classes corresponds to that in the attribute classes_.
Expand source code Browse git
def predict_proba(self, X): """Predict class probabilities for X. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- p : array of shape = [n_samples, n_classes] The class probabilities of the input samples. The order of the classes corresponds to that in the attribute classes_. """ if not self.estimator_fitted: raise NotFittedError( 'The estimator must be fitted before calling predict_proba().' ) probabilistic_predictions = self.estimator.predict_proba(X) probabilistic_predictions = probabilistic_predictions[:, 1] return probabilistic_predictions / self.c
class WeightedElkanotoPuClassifier (estimator, labeled, unlabeled, hold_out_ratio=0.1)
-
Positive-unlabeled classifier using the weighted Elkan & Noto method.
See the original paper for details on how the
labeled
andunlabeled
quantities are used to weigh training examples and affect the learning process: https://cseweb.ucsd.edu/~elkan/posonly.pdf
Parameters
estimator
:sklearn.BaseEstimator
- Any sklearn-compliant estimator object implementing the fit() and predict_proba() methods.
labeled
:int
- The cardinality to attribute to the labeled training set.
unlabeled
:int
- The cardinality to attribute to the unlabeled training set.
hold_out_ratio
:float
, default0.1
The ratio of training examples to set aside to estimate the probability of an example to be positive.
Expand source code Browse git
class WeightedElkanotoPuClassifier(BaseEstimator, ClassifierMixin): """Positive-unlabeled classifier using the weighted Elkan & Noto method. See the original paper for details on how the `labeled` and `unlabeled` quantities are used to weigh training examples and affect the learning process: https://cseweb.ucsd.edu/~elkan/posonly.pdf Parameters ---------- estimator : sklearn.BaseEstimator Any sklearn-compliant estimator object implementing the fit() and predict_proba() methods. labeled : int The cardinality to attribute to the labeled training set. unlabeled : int The cardinality to attribute to the unlabeled training set. hold_out_ratio : float, default 0.1 The ratio of training examples to set aside to estimate the probability of an exmaple to be positive. """ def __init__(self, estimator, labeled, unlabeled, hold_out_ratio=0.1): self.estimator = estimator self.c = 1.0 self.hold_out_ratio = hold_out_ratio self.labeled = labeled self.unlabeled = unlabeled self.estimator_fitted = False def __str__(self): return 'Estimator: {}\np(s=1|y=1,x) ~= {}\nFitted: {}'.format( self.estimator, self.c, self.estimator_fitted, ) def fit(self, X, y): """Fits the classifier Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values. An array of int. Returns ------- self : object Returns self. """ positives = np.where(y == 1.0)[0] hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio)) # check for the required number of positive examples if len(positives) <= hold_out_size: raise ValueError( 'Not enough positive examples to estimate p(s=1|y=1,x).' 
' Need at least {}.'.format(hold_out_size + 1) ) # construct the holdout set np.random.shuffle(positives) hold_out = positives[:hold_out_size] X_hold_out = X[hold_out] X = np.delete(X, hold_out, 0) y = np.delete(y, hold_out) # fit the inner estimator self.estimator.fit(X, y) hold_out_predictions = self.estimator.predict_proba(X_hold_out) hold_out_predictions = hold_out_predictions[:, 1] # update c, the positive proba estimate c = np.mean(hold_out_predictions) self.c = c self.estimator_fitted = True # Returns E[y] which is P(y=1) def _estimateEy(self, G): n = self.labeled m = self.labeled + self.unlabeled G = G[:, 1] np.place(G, G == 1.0, 0.999) W = (G / (1 - G)) * ((1 - self.c) / self.c) return (float(n) + float(W.sum())) / float(m) def predict_proba(self, X): """Predict class probabilities for X. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- p : array of shape = [n_samples, n_classes] The class probabilities of the input samples. The order of the classes corresponds to that in the attribute classes_. """ if not self.estimator_fitted: raise NotFittedError( 'The estimator must be fitted before calling predict_proba().' ) n = self.labeled m = self.labeled + self.unlabeled # self.estimator.predict_proba gives the probability of P(s=1|x) # for x belongs to P or U probabilistic_predictions = self.estimator.predict_proba(X) yEstimate = self._estimateEy(probabilistic_predictions) probabilistic_predictions = probabilistic_predictions[:, 1] numerator = probabilistic_predictions * (self.c * yEstimate * m) return numerator / float(n) def predict(self, X, treshold=0.5): """Predict labels. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. threshold : float, default 0.5 The decision threshold over probability to warrent a positive label. Returns ------- y : array of int of shape = [n_samples] Predicted labels for the given inpurt samples. 
""" if not self.estimator_fitted: raise NotFittedError( 'The estimator must be fitted before calling predict().' ) return np.array([ 1.0 if p > treshold else -1.0 for p in self.predict_proba(X) ])
Ancestors
- sklearn.base.BaseEstimator
- sklearn.base.ClassifierMixin
Methods
def fit(self, X, y)
-
Fits the classifier
Parameters
X
:array
-like
,shape
= [n_samples
,n_features
]- The training input samples.
y
:array
-like
,shape
= [n_samples
]- The target values. An array of int.
Returns
self
:object
- Returns self.
Expand source code Browse git
def fit(self, X, y): """Fits the classifier Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values. An array of int. Returns ------- self : object Returns self. """ positives = np.where(y == 1.0)[0] hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio)) # check for the required number of positive examples if len(positives) <= hold_out_size: raise ValueError( 'Not enough positive examples to estimate p(s=1|y=1,x).' ' Need at least {}.'.format(hold_out_size + 1) ) # construct the holdout set np.random.shuffle(positives) hold_out = positives[:hold_out_size] X_hold_out = X[hold_out] X = np.delete(X, hold_out, 0) y = np.delete(y, hold_out) # fit the inner estimator self.estimator.fit(X, y) hold_out_predictions = self.estimator.predict_proba(X_hold_out) hold_out_predictions = hold_out_predictions[:, 1] # update c, the positive proba estimate c = np.mean(hold_out_predictions) self.c = c self.estimator_fitted = True
def predict(self, X, treshold=0.5)
-
Predict labels.
Parameters
X
:array
-like
ofshape
= [n_samples
,n_features
]- The input samples.
threshold
:float
, default0.5
- The decision threshold over probability to warrant a positive label.
Returns
y
:array
ofint
ofshape
= [n_samples
]- Predicted labels for the given input samples.
Expand source code Browse git
def predict(self, X, treshold=0.5): """Predict labels. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. threshold : float, default 0.5 The decision threshold over probability to warrent a positive label. Returns ------- y : array of int of shape = [n_samples] Predicted labels for the given inpurt samples. """ if not self.estimator_fitted: raise NotFittedError( 'The estimator must be fitted before calling predict().' ) return np.array([ 1.0 if p > treshold else -1.0 for p in self.predict_proba(X) ])
def predict_proba(self, X)
-
Predict class probabilities for X.
Parameters
X
:array
-like
ofshape
= [n_samples
,n_features
]- The input samples.
Returns
p
:array
ofshape
= [n_samples
,n_classes
]- The class probabilities of the input samples. The order of the classes corresponds to that in the attribute classes_.
Expand source code Browse git
def predict_proba(self, X): """Predict class probabilities for X. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- p : array of shape = [n_samples, n_classes] The class probabilities of the input samples. The order of the classes corresponds to that in the attribute classes_. """ if not self.estimator_fitted: raise NotFittedError( 'The estimator must be fitted before calling predict_proba().' ) n = self.labeled m = self.labeled + self.unlabeled # self.estimator.predict_proba gives the probability of P(s=1|x) # for x belongs to P or U probabilistic_predictions = self.estimator.predict_proba(X) yEstimate = self._estimateEy(probabilistic_predictions) probabilistic_predictions = probabilistic_predictions[:, 1] numerator = probabilistic_predictions * (self.c * yEstimate * m) return numerator / float(n)