In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [3]:
# Read the raw dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="data.csv")
# Show the first two rows as this cell's output to check the columns.
data.head(n=2)
Out[3]:
Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
In [8]:
# Feature matrix: the first three columns, selected by position.
x = data.iloc[:, list(range(3))]
# Target vector: the column at position 3.
y = data.iloc[:, 3]
# Display the feature matrix as this cell's output.
x
Out[8]:
Country Age Salary
0 France 44.0 72000.0
1 Spain 27.0 48000.0
2 Germany 30.0 54000.0
3 Spain 38.0 61000.0
4 Germany 40.0 NaN
5 France 35.0 58000.0
6 Spain NaN 52000.0
7 France 48.0 79000.0
8 Germany 50.0 83000.0
9 France 37.0 67000.0
In [10]:
# Replace missing values in the columns at positions 1 and 2 with each
# column's own mean (pandas' mean skips NaN by default).
#
# The previous form, `x.iloc[:, k].fillna(..., inplace=True)`, called an
# in-place method on a temporary Series selected from `x`. That is chained
# assignment: it triggers SettingWithCopy/ChainedAssignment warnings and,
# under pandas Copy-on-Write, leaves `x` unchanged. Filling at the DataFrame
# level and assigning the result back is reliable, and re-running the cell is
# a no-op because no NaN remains in those columns.
#
# `fillna` with a Series fills each column whose name matches the Series
# index; columns not named there are left untouched.
x = x.fillna(x.iloc[:, [1, 2]].mean())
x
Out[10]:
Country Age Salary
0 France 44.000000 72000.000000
1 Spain 27.000000 48000.000000
2 Germany 30.000000 54000.000000
3 Spain 38.000000 61000.000000
4 Germany 40.000000 63777.777778
5 France 35.000000 58000.000000
6 Spain 38.777778 52000.000000
7 France 48.000000 79000.000000
8 Germany 50.000000 83000.000000
9 France 37.000000 67000.000000
In [84]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
In [12]:
# One-hot encode x with pandas' default column selection; the result
# replaces x.
x = pd.get_dummies(data=x)
# Display the encoded feature matrix as this cell's output.
x
Out[12]:
Age Salary Country_France Country_Germany Country_Spain
0 44.000000 72000.000000 1 0 0
1 27.000000 48000.000000 0 0 1
2 30.000000 54000.000000 0 1 0
3 38.000000 61000.000000 0 0 1
4 40.000000 63777.777778 0 1 0
5 35.000000 58000.000000 1 0 0
6 38.777778 52000.000000 0 0 1
7 48.000000 79000.000000 1 0 0
8 50.000000 83000.000000 0 1 0
9 37.000000 67000.000000 1 0 0
In [ ]:
 
In [ ]:
 

Comments