diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 41069f8..372a807 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -631,7 +631,7 @@ "outputs": [], "source": [ "# 이 버전의 test_set_check() 함수가 파이썬 2도 지원합니다.\n", - "def test_set_check(identifier, test_ratio, hash):\n", + "def test_set_check(identifier, test_ratio, hash=hashlib.md5):\n", " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio" ] }, @@ -2597,6 +2597,13 @@ "이제 범주형 입력 특성인 `ocean_proximity`을 전처리합니다:" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 책에 실린 방법" + ] + }, { "cell_type": "code", "execution_count": 59, @@ -3057,6 +3064,129 @@ "cat_encoder.categories_" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### future_encoders.py를 사용한 새로운 방법" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_cat = housing[['ocean_proximity']]\n", + "housing_cat.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "주의: 번역서는 판다스의 `Series.factorize()` 메서드를 사용하여 문자열 범주형 특성을 정수로 인코딩합니다. 사이킷런 0.20에 추가될 `OrdinalEncoder` 클래스(PR #10521)는 입력 특성(레이블 `y`가 아니라 `X`)을 위해 설계되었고 파이프라인(나중에 이 노트북에서 나옵니다)과 잘 작동되기 때문에 더 좋은 방법입니다. 지금은 `future_encoders.py` 파일에서 임포트하지만 사이킷런 0.20 버전이 릴리스되면 `sklearn.preprocessing`에서 바로 임포팅할 수 있습니다." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from future_encoders import OrdinalEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ordinal_encoder = OrdinalEncoder()\n", + "housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)\n", + "housing_cat_encoded[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ordinal_encoder.categories_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "주의: 번역서는 `CategoricalEncoder`를 사용하여 각 범주형 값을 원-핫 벡터로 변경합니다. `OneHotEncoder`를 사용하는 것이 더 낫습니다. 지금은 정수형 범주 입력만 다룰 수 있지만 사이킷런 0.20에서는 문자열 범주 입력도 다룰 수 있을 것입니다(PR #10521). 지금은 `future_encoders.py` 파일에서 임포트하지만 사이킷런 0.20 버전이 릴리스되면 `sklearn.preprocessing`에서 바로 임포팅할 수 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from future_encoders import OneHotEncoder\n", + "\n", + "cat_encoder = OneHotEncoder()\n", + "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)\n", + "housing_cat_1hot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "기본적으로 `OneHotEncoder` 클래스는 희소 행렬을 반환하지만 필요하면 `toarray()` 메서드를 호출하여 밀집 배열로 바꿀 수 있습니다:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_cat_1hot.toarray()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "또는 `OneHotEncoder` 객체를 만들 때 `sparse=False`로 지정하면 됩니다:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat_encoder = OneHotEncoder(sparse=False)\n", + "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)\n", + "housing_cat_1hot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"cat_encoder.categories_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 다시 책의 내용이 이어집니다" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -3237,7 +3367,9 @@ } ], "source": [ - "housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n", + "housing_extra_attribs = pd.DataFrame(\n", + " housing_extra_attribs, \n", + " columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n", "housing_extra_attribs.head()" ] }, @@ -3353,6 +3485,19 @@ " ])" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# future_encoders.py를 사용한 방법\n", + "cat_pipeline = Pipeline([\n", + " ('selector', DataFrameSelector(cat_attribs)),\n", + " ('cat_encoder', OneHotEncoder(sparse=False)),\n", + " ])" + ] + }, { "cell_type": "code", "execution_count": 75, @@ -4789,6 +4934,74 @@ "final_rmse" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "테스트 RMSE에 대한 95% 신뢰 구간을 계산할 수 있습니다:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy import stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "confidence = 0.95\n", + "squared_errors = (final_predictions - y_test) ** 2\n", + "mean = squared_errors.mean()\n", + "m = len(squared_errors)\n", + "\n", + "np.sqrt(stats.t.interval(confidence, m - 1,\n", + " loc=np.mean(squared_errors),\n", + " scale=stats.sem(squared_errors)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "다음과 같이 수동으로 계산할 수도 있습니다:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n", + "tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)\n", + "np.sqrt(mean - tmargin), 
np.sqrt(mean + tmargin)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "또는 t 점수 대신 z 점수를 사용할 수도 있습니다:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zscore = stats.norm.ppf((1 + confidence) / 2)\n", + "zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)\n", + "np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -6182,7 +6395,7 @@ "from scipy.stats import expon, reciprocal\n", "\n", "# expon(), reciprocal()와 다른 확률 분포 함수에 대해서는\n", - "# https://docs.scipy.org/doc/scipy-0.19.0/reference/stats.html를 참고하세요.\n", + "# https://docs.scipy.org/doc/scipy/reference/stats.html를 참고하세요.\n", "\n", "# 노트: kernel 매개변수가 \"linear\"일 때는 gamma가 무시됩니다.\n", "param_distribs = {\n", diff --git a/future_encoders.py b/future_encoders.py new file mode 100644 index 0000000..defd89e --- /dev/null +++ b/future_encoders.py @@ -0,0 +1,882 @@ +# Authors: Andreas Mueller +# Joris Van den Bossche +# License: BSD 3 clause + +from __future__ import division + +import numbers +import warnings + +import numpy as np +from scipy import sparse + +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.externals import six +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted, FLOAT_DTYPES +from sklearn.preprocessing.label import LabelEncoder + + +BOUNDS_THRESHOLD = 1e-7 + + +zip = six.moves.zip +map = six.moves.map +range = six.moves.range + +__all__ = [ + 'OneHotEncoder', + 'OrdinalEncoder' +] + + +def _argmax(arr_or_spmatrix, axis=None): + return arr_or_spmatrix.argmax(axis=axis) + + +def _handle_zeros_in_scale(scale, copy=True): + ''' Makes sure that whenever scale is zero, we handle it correctly. + + This happens in most scalers when we have constant features.''' + + # if we are fitting on 1D arrays, scale might be a scalar + if np.isscalar(scale): + if scale == .0: + scale = 1. 
class _BaseEncoder(BaseEstimator, TransformerMixin):
    """
    Base class for encoders that includes the code to categorize and
    transform the input features.

    Subclasses must provide a ``categories`` attribute (``'auto'`` or one
    sorted sequence of values per feature). Fitting stores one
    ``LabelEncoder`` per feature in ``_label_encoders_`` and the categories
    of each feature in ``categories_``.
    """

    @staticmethod
    def _check_X(X):
        """Validate ``X`` with ``check_array`` and return the result.

        Input that has no ``dtype`` attribute and that ``check_array``
        converted to a NumPy string dtype is validated again with
        ``dtype=object``.
        """
        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            # builtin ``object`` replaces the removed ``np.object`` alias
            return check_array(X, dtype=object)
        return X_temp

    def _fit(self, X, handle_unknown='error'):
        """Determine the categories of every column of ``X``.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.

        handle_unknown : 'error' or other value, default='error'
            Only consulted when ``self.categories`` is not ``'auto'``: with
            ``'error'``, values of ``X`` missing from the given categories
            raise a ``ValueError``.

        Raises
        ------
        ValueError
            If the given categories are unsorted, if their number differs
            from the number of columns of ``X``, or if unknown values are
            found while ``handle_unknown == 'error'``.
        """
        X = self._check_X(X)
        n_features = X.shape[1]

        if self.categories != 'auto':
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")
            if len(self.categories) != n_features:
                raise ValueError("Shape mismatch: if categories is an array,"
                                 " it has to be of shape (n_features,).")

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                if handle_unknown == 'error':
                    valid_mask = np.in1d(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                # user-supplied categories are installed directly instead
                # of being learned from the data
                le.classes_ = np.array(self.categories[i])

        self.categories_ = [le.classes_ for le in self._label_encoders_]

    def _transform(self, X, handle_unknown='error'):
        """Encode every column of ``X`` as integer codes.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        handle_unknown : 'error' or other value, default='error'
            With ``'error'``, values missing from ``categories_`` raise a
            ``ValueError``; otherwise they are flagged in the returned mask.

        Returns
        -------
        X_int : integer array with the shape of ``X``
            Codes produced by the per-feature label encoders.

        X_mask : boolean array with the shape of ``X``
            ``False`` where the value was not a known category.
        """
        X = self._check_X(X)

        _, n_features = X.shape
        # builtin ``int`` / ``bool`` replace the removed NumPy aliases
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            Xi = X[:, i]
            valid_mask = np.in1d(Xi, self.categories_[i])

            if not np.all(valid_mask):
                if handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` so that
                    # callers can tell them apart.
                    X_mask[:, i] = valid_mask
                    Xi = Xi.copy()
                    Xi[~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(Xi)

        return X_int, X_mask
+ + sparse : boolean, default=True + Will return sparse matrix if set True else will return an array. + + dtype : number type, default=np.float + Desired dtype of output. + + handle_unknown : 'error' (default) or 'ignore' + Whether to raise an error or ignore if a unknown categorical feature is + present during transform (default is to raise). When this parameter + is set to 'ignore' and an unknown category is encountered during + transform, the resulting one-hot encoded columns for this feature + will be all zeros. In the inverse transform, an unknown category + will be denoted as None. + + n_values : 'auto', int or array of ints + Number of values per feature. + + - 'auto' : determine value range from training data. + - int : number of categorical values per feature. + Each feature value should be in ``range(n_values)`` + - array : ``n_values[i]`` is the number of categorical values in + ``X[:, i]``. Each feature value should be + in ``range(n_values[i])`` + + .. deprecated:: 0.20 + The `n_values` keyword is deprecated and will be removed in 0.22. + Use `categories` instead. + + categorical_features : "all" or array of indices or mask + Specify what features are treated as categorical. + + - 'all' (default): All features are treated as categorical. + - array of indices: Array of categorical feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-categorical features are always stacked to the right of the matrix. + + .. deprecated:: 0.20 + The `categorical_features` keyword is deprecated and will be + removed in 0.22. + + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during fitting + (in order corresponding with output of ``transform``). + + active_features_ : array + Indices for active features, meaning values that actually occur + in the training set. Only available when n_values is ``'auto'``. + + .. 
deprecated:: 0.20 + + feature_indices_ : array of shape (n_features,) + Indices to feature ranges. + Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by `active_features_` afterwards) + + .. deprecated:: 0.20 + + n_values_ : array of shape (n_features,) + Maximum number of values per feature. + + .. deprecated:: 0.20 + + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to a binary one-hot encoding. + + >>> from sklearn.preprocessing import OneHotEncoder + >>> enc = OneHotEncoder(handle_unknown='ignore') + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + ... # doctest: +ELLIPSIS + OneHotEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_unknown='ignore', sparse=True) + + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() + array([[ 1., 0., 1., 0., 0.], + [ 0., 1., 0., 0., 0.]]) + >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) + array([['Male', 1], + [None, 2]], dtype=object) + + See also + -------- + sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) + encoding of the categorical features. + sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot + encoding of dictionary items or strings. + sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all + fashion. + sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of + iterables and a multilabel format, e.g. a (samples x classes) binary + matrix indicating the presence of a class label. 
def __init__(self, n_values=None, categorical_features=None,
             categories=None, sparse=True, dtype=np.float64,
             handle_unknown='error'):
    """Store the constructor arguments.

    ``categories=None`` is remembered in ``_categories`` (so that later
    code can tell whether the caller supplied categories) while the public
    ``categories`` attribute falls back to ``'auto'``. The deprecated
    ``n_values`` and ``categorical_features`` arguments are kept in private
    attributes, defaulting to ``'auto'`` and ``'all'`` when left as ``None``.
    """
    # Deprecated keywords: written straight to the backing attributes.
    self._deprecated_n_values = 'auto' if n_values is None else n_values
    self._deprecated_categorical_features = (
        'all' if categorical_features is None else categorical_features)

    self._categories = categories
    self.categories = 'auto' if categories is None else categories

    self.sparse = sparse
    self.dtype = dtype
    self.handle_unknown = handle_unknown
def _handle_deprecations(self, X):
    """Interpret the deprecated keywords and choose the code path.

    Sets ``self._legacy_mode``. When the deprecated ``n_values`` keyword
    was given, it is translated into ``self.categories`` and the
    ``_n_values_`` / ``_feature_indices_`` attributes are filled in.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data about to be fitted.

    Raises
    ------
    TypeError
        If ``n_values`` is neither ``'auto'``, an integer nor convertible
        to an integer array.
    ValueError
        If ``categorical_features`` is combined with ``categories``.
    """
    user_set_categories = False

    deprecated_n_values = self._deprecated_n_values
    # isinstance guard: comparing an array-like directly with 'auto' is
    # ambiguous, so only strings are compared
    n_values_is_auto = (isinstance(deprecated_n_values, six.string_types)
                        and deprecated_n_values == 'auto')

    if self._categories is not None:
        self._legacy_mode = False
        user_set_categories = True

    elif not n_values_is_auto:
        msg = (
            "Passing 'n_values' is deprecated and will be removed in a "
            "future release. You can use the 'categories' keyword instead."
            " 'n_values=n' corresponds to 'categories=[range(n)]'.")
        warnings.warn(msg, DeprecationWarning)

        # we internally translate this to the correct categories
        # and don't use legacy mode
        X = check_array(X, dtype=int)

        if isinstance(deprecated_n_values, numbers.Integral):
            n_features = X.shape[1]
            self.categories = [
                list(range(deprecated_n_values))
                for _ in range(n_features)]
            n_values = np.empty(n_features, dtype=int)
            n_values.fill(deprecated_n_values)
        else:
            try:
                n_values = np.asarray(deprecated_n_values, dtype=int)
                self.categories = [list(range(i))
                                   for i in deprecated_n_values]
            except (ValueError, TypeError):
                # %-interpolation (the template has no {} fields) and the
                # type of the offending argument rather than of X
                raise TypeError(
                    "Wrong type for parameter `n_values`. Expected 'auto',"
                    " int or array of ints, got %r"
                    % (type(deprecated_n_values),))

        self._n_values_ = n_values
        n_values = np.hstack([[0], n_values])
        indices = np.cumsum(n_values)
        self._feature_indices_ = indices

        self._legacy_mode = False

    else:  # n_values = 'auto'
        if self.handle_unknown == 'ignore':
            # no change in behaviour, no need to raise deprecation warning
            self._legacy_mode = False
        else:
            # check if we have integer or categorical input
            try:
                check_array(X, dtype=int)
            except ValueError:
                self._legacy_mode = False
            else:
                warnings.warn(WARNING_MSG, DeprecationWarning)
                self._legacy_mode = True

    categorical_features = self._deprecated_categorical_features
    selects_all = (isinstance(categorical_features, six.string_types)
                   and categorical_features == 'all')
    if not selects_all:
        if user_set_categories:
            raise ValueError(
                "The 'categorical_features' keyword is deprecated, and "
                "cannot be used together with specifying 'categories'.")
        warnings.warn("The 'categorical_features' keyword is deprecated.",
                      DeprecationWarning)
        self._legacy_mode = True
def _legacy_fit_transform(self, X):
    """Fit on non-negative integer input and return its one-hot encoding.

    Assumes X contains only categorical features. Sets ``_n_values_``,
    ``_feature_indices_``, ``categories_`` and, when ``n_values`` is
    ``'auto'``, ``_active_features_``.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        Input validated with ``check_array(X, dtype=int)``.

    Returns
    -------
    out : sparse CSR matrix if ``self.sparse`` else a dense array

    Raises
    ------
    ValueError
        If X contains negative values, if a value is out of bounds for an
        integer ``n_values``, or if an array ``n_values`` has the wrong
        shape.
    TypeError
        If ``n_values`` cannot be converted to an integer array.
    """
    self_n_values = self._deprecated_n_values
    auto_n_values = (isinstance(self_n_values, six.string_types) and
                     self_n_values == 'auto')
    dtype = getattr(X, 'dtype', None)
    X = check_array(X, dtype=int)
    if np.any(X < 0):
        raise ValueError("X needs to contain only non-negative integers.")
    n_samples, n_features = X.shape

    if auto_n_values:
        n_values = np.max(X, axis=0) + 1
    elif isinstance(self_n_values, numbers.Integral):
        if (np.max(X, axis=0) >= self_n_values).any():
            raise ValueError("Feature out of bounds for n_values=%d"
                             % self_n_values)
        n_values = np.empty(n_features, dtype=int)
        n_values.fill(self_n_values)
    else:
        try:
            n_values = np.asarray(self_n_values, dtype=int)
        except (ValueError, TypeError):
            # report the type of the offending argument, not of X
            raise TypeError("Wrong type for parameter `n_values`. Expected"
                            " 'auto', int or array of ints, got %r"
                            % (type(self_n_values),))
        if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
            raise ValueError("Shape mismatch: if n_values is an array,"
                             " it has to be of shape (n_features,).")

    self._n_values_ = n_values
    # column offset of every feature inside the encoded matrix
    n_values = np.hstack([[0], n_values])
    indices = np.cumsum(n_values)
    self._feature_indices_ = indices

    column_indices = (X + indices[:-1]).ravel()
    row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                            n_features)
    data = np.ones(n_samples * n_features)
    out = sparse.coo_matrix((data, (row_indices, column_indices)),
                            shape=(n_samples, indices[-1]),
                            dtype=self.dtype).tocsr()

    if auto_n_values:
        # keep only the columns that are non-zero for at least one sample
        mask = np.array(out.sum(axis=0)).ravel() != 0
        active_features = np.where(mask)[0]
        out = out[:, active_features]
        self._active_features_ = active_features

    # NOTE(review): ``if dtype`` tests the truthiness of the dtype object,
    # not ``dtype is not None`` -- confirm this is the intended condition.
    self.categories_ = [
        np.unique(X[:, i]).astype(dtype) if dtype else np.unique(X[:, i])
        for i in range(n_features)]

    return out if self.sparse else out.toarray()
def _transform_new(self, X):
    """One-hot encode categorical input using the fitted categories.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to encode.

    Returns
    -------
    out : sparse CSR matrix if ``self.sparse`` else a dense array
        One column per entry of ``categories_``; entries flagged as unknown
        by ``_transform`` contribute no non-zero value.
    """
    X_temp = check_array(X, dtype=None)
    if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
        # builtin ``object`` replaces the removed ``np.object`` alias
        X = check_array(X, dtype=object)
    else:
        X = X_temp

    n_samples, n_features = X.shape

    X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)

    mask = X_mask.ravel()
    # column offset of every feature inside the encoded matrix
    n_values = [cats.shape[0] for cats in self.categories_]
    n_values = np.array([0] + n_values)
    feature_indices = np.cumsum(n_values)

    # CSR layout: one stored entry per valid (row, feature) pair, so the
    # row pointer is the running count of valid entries per row
    indices = (X_int + feature_indices[:-1]).ravel()[mask]
    indptr = X_mask.sum(axis=1).cumsum()
    indptr = np.insert(indptr, 0, 0)
    data = np.ones(n_samples * n_features)[mask]

    out = sparse.csr_matrix((data, indices, indptr),
                            shape=(n_samples, feature_indices[-1]),
                            dtype=self.dtype)
    return out if self.sparse else out.toarray()
+ """ + if not self._legacy_mode: + return self._transform_new(X) + else: + return _transform_selected(X, self._legacy_transform, + self._deprecated_categorical_features, + copy=True) + + def inverse_transform(self, X): + """Convert back the data to the original representation. + + In case unknown categories are encountered (all zero's in the + one-hot encoding), ``None`` is used to represent this category. + + Parameters + ---------- + X : array-like or sparse matrix, shape [n_samples, n_encoded_features] + The transformed data. + + Returns + ------- + X_tr : array-like, shape [n_samples, n_features] + Inverse transformed array. + + """ + # if self._legacy_mode: + # raise ValueError("only supported for categorical features") + + check_is_fitted(self, 'categories_') + X = check_array(X, accept_sparse='csr') + + n_samples, _ = X.shape + n_features = len(self.categories_) + n_transformed_features = sum([len(cats) for cats in self.categories_]) + + # validate shape of passed X + msg = ("Shape of the passed X data is not correct. 
class OrdinalEncoder(_BaseEncoder):
    """Encode categorical features as an integer array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are converted to ordinal integers. This results in
    a single column of integers (0 to n_categories - 1) per feature.

    Parameters
    ----------
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories must be sorted and should not mix
          strings and numeric values.

        The used categories can be found in the ``categories_`` attribute.

    dtype : number type, default np.float64
        Desired dtype of output.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order corresponding with output of ``transform``).

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to an ordinal encoding.

    >>> from sklearn.preprocessing import OrdinalEncoder
    >>> enc = OrdinalEncoder()
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    ... # doctest: +ELLIPSIS
    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 3], ['Male', 1]])
    array([[ 0.,  2.],
           [ 1.,  0.]])

    >>> enc.inverse_transform([[1, 0], [0, 1]])
    array([['Male', 1],
           ['Female', 2]], dtype=object)

    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      categorical features.
    sklearn.preprocessing.LabelEncoder : encodes target labels with values
      between 0 and n_classes-1.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, categories='auto', dtype=np.float64):
        self.dtype = dtype
        self.categories = categories

    def fit(self, X, y=None):
        """Fit the OrdinalEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.

        y : ignored
            Accepted but not used.

        Returns
        -------
        self

        """
        self._fit(X)
        return self

    def transform(self, X):
        """Transform X to ordinal codes.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : 2-d array
            Integer codes of X converted to ``self.dtype``.

        """
        codes, _unused_mask = self._transform(X)
        return codes.astype(self.dtype, copy=False)

    def inverse_transform(self, X):
        """Convert back the data to the original representation.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.

        Returns
        -------
        X_tr : array-like, shape [n_samples, n_features]
            Inverse transformed array.

        Raises
        ------
        ValueError
            If the number of columns of X differs from the number of
            fitted features.

        """
        check_is_fitted(self, 'categories_')
        X = check_array(X, accept_sparse='csr')

        n_samples, n_columns = X.shape
        fitted_categories = self.categories_
        n_features = len(fitted_categories)

        # validate shape of passed X
        if n_columns != n_features:
            msg = ("Shape of the passed X data is not correct. Expected {0} "
                   "columns, got {1}.")
            raise ValueError(msg.format(n_features, n_columns))

        # create resulting array of appropriate dtype
        category_dtypes = [cats.dtype for cats in fitted_categories]
        dt = np.find_common_type(category_dtypes, [])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        # the codes of each column index into that column's categories
        for col, cats in enumerate(fitted_categories):
            codes = X[:, col].astype('int64')
            X_tr[:, col] = cats[codes]

        return X_tr