One-hot encoding
The code
In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
In [2]:
# One categorical feature (a single column) with three levels: A, B and C.
X = np.array(["A", "A", "B", "C"]).reshape(-1, 1)
In [3]:
# Echo X to confirm it is a single column of one-character string labels.
X
Out[3]:
array([['A'], ['A'], ['B'], ['C']], dtype='<U1')
In [4]:
# All defaults: the encoded result comes back as a sparse matrix, which is why
# the next cell calls .todense() on it before displaying.
enc = OneHotEncoder()
In [5]:
# Learn the categories from X and encode it: one column per category (A, B, C),
# with a single 1 in each row.
# .todense() produces a numpy `matrix` (see the output below).
# NOTE(review): .toarray() would give a plain ndarray instead - consider switching.
enc.fit_transform(X).todense()
Out[5]:
matrix([[1., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
Output with dense matrix
In [6]:
# sparse=False makes the encoder return a dense array, so no .todense() is needed.
# NOTE(review): newer scikit-learn releases rename this argument to `sparse_output`
# and drop `sparse`; check the installed version before reusing this cell.
enc = OneHotEncoder(sparse=False)
In [7]:
# Same one-hot columns (A, B, C), now returned directly as an ndarray.
enc.fit_transform(X)
Out[7]:
array([[1., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
Removing one dummy variable
In [8]:
# drop='first' removes the column of the first category, so three categories
# are encoded with two columns.
enc = OneHotEncoder(sparse=False, drop='first')
In [9]:
# 'A' (the dropped category) is encoded as all zeros; 'B' and 'C' keep one column each.
enc.fit_transform(X)
Out[9]:
array([[0., 0.], [0., 0.], [1., 0.], [0., 1.]])
Removing one dummy variable from binary features
In [10]:
# Rebind X to a binary feature: only two distinct levels, A and C.
X = np.array(["A", "A", "A", "C"]).reshape(-1, 1)
In [11]:
# Echo the new X: a single column containing only 'A' and 'C'.
X
Out[11]:
array([['A'], ['A'], ['A'], ['C']], dtype='<U1')
In [12]:
# drop="if_binary" applies the drop to features with exactly two categories.
# X has only 'A' and 'C', so the result is a single 0/1 column
# ('A' -> 0, 'C' -> 1).
enc = OneHotEncoder(sparse=False, drop="if_binary")
enc.fit_transform(X)
Out[12]:
array([[0.], [0.], [0.], [1.]])
Error handling
In [13]:
# handle_unknown="error": categories not seen during fit make transform() raise.
# Fit on X, whose only categories are 'A' and 'C'.
enc = OneHotEncoder(sparse=False, handle_unknown="error")
enc.fit(X)
Out[13]:
OneHotEncoder(sparse=False)
In [14]:
# X holds only categories seen during fit, so it encodes cleanly into two
# columns (A, C).
enc.transform(X)
Out[14]:
array([[1., 0.], [1., 0.], [1., 0.], [0., 1.]])
In [15]:
# New data containing 'B', a category absent from X (which holds only 'A' and 'C').
Y = np.array(["A", "A", "B", "C"]).reshape(-1, 1)
In [16]:
# Expected to fail: with handle_unknown='error' the unseen category 'B' makes
# transform() raise ValueError (see the traceback below).
enc.transform(Y)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-16-db506db4b582> in <module> ----> 1 enc.transform(Y) c:\users\gianl\appdata\local\programs\python\python38-32\lib\site-packages\sklearn\preprocessing\_encoders.py in transform(self, X) 426 check_is_fitted(self) 427 # validation of X happens in _check_X called by _transform --> 428 X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) 429 430 n_samples, n_features = X_int.shape c:\users\gianl\appdata\local\programs\python\python38-32\lib\site-packages\sklearn\preprocessing\_encoders.py in _transform(self, X, handle_unknown) 122 msg = ("Found unknown categories {0} in column {1}" 123 " during transform".format(diff, i)) --> 124 raise ValueError(msg) 125 else: 126 # Set the problematic rows to an acceptable value and ValueError: Found unknown categories ['B'] in column 0 during transform
In [17]:
# handle_unknown="ignore": unseen categories are tolerated at transform time
# instead of raising.
enc = OneHotEncoder(sparse=False, handle_unknown="ignore")
In [18]:
# Fit the new encoder on X (categories 'A' and 'C'); the displayed repr
# confirms handle_unknown='ignore' is set.
enc.fit(X)
Out[18]:
OneHotEncoder(handle_unknown='ignore', sparse=False)
In [19]:
# No exception this time: the row holding the unseen 'B' is encoded as all zeros.
enc.transform(Y)
Out[19]:
array([[1., 0.], [1., 0.], [0., 0.], [0., 1.]])
In [20]:
# Three samples with two categorical features each:
# first feature in {A, B, C}, second feature in {X, Y, Z}.
X = [
    ["A", "X"],
    ["B", "Y"],
    ["C", "Z"],
]
In [21]:
# Spell out the categories of each feature up front instead of inferring them
# from the data; "D" is listed even though it never occurs in X.
enc = OneHotEncoder(
    sparse=False,
    categories=[
        ["A", "B", "C", "D"],
        ["X", "Y", "Z"],
    ],
)
In [22]:
# Seven output columns: four for the first feature (A, B, C, D) followed by
# three for the second (X, Y, Z). The 'D' column is all zeros for this data.
enc.fit_transform(X)
Out[22]:
array([[1., 0., 0., 0., 1., 0., 0.], [0., 1., 0., 0., 0., 1., 0.], [0., 0., 1., 0., 0., 0., 1.]])
In [23]:
# A single sample whose first feature is "D" and whose second feature is "Z".
Y = [
    ["D", "Z"],
]
In [24]:
# 'D' was declared in `categories`, so it maps to its own (fourth) column even
# though no fitted sample contained it; 'Z' maps to the last column.
enc.transform(Y)
Out[24]:
array([[0., 0., 0., 1., 0., 0., 1.]])
0 comments