依前一篇的 LOF 將離群值刪除後,再重新進行訓練及預測,結果可以將分數拉到 0.81。如果再把 LSTAT 做 1/3 次方轉換(LSTAT ** (1/3)),則可以拉高到 0.85。
底下為完整代碼
"""Boston housing price regression.

Pipeline: drop outliers with Local Outlier Factor (LOF), cube-root
transform LSTAT, then fit/score a linear regression on LSTAT + RM.
"""
# NOTE(review): sklearn.datasets.load_boston was removed in scikit-learn 1.2.
# Pin scikit-learn < 1.2 (or load the data from an alternative source) to
# keep this script runnable on modern environments.
from sklearn.datasets import load_boston
import pandas as pd
import seaborn as sns  # NOTE(review): unused in this script
import pylab as plt    # NOTE(review): unused in this script
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor as LOF

# Show full DataFrames when printing (disable column/row truncation).
display = pd.options.display
display.max_columns = None
display.max_rows = None
display.width = None
display.max_colwidth = None

# Load the Boston housing data; keep only the target plus two predictors.
boston_dataset = load_boston()
df = pd.DataFrame(data=boston_dataset.data, columns=boston_dataset.feature_names)
df.insert(0, column="PRICE", value=boston_dataset.target)
df = df[['PRICE', 'LSTAT', 'RM']]

x = df[['LSTAT', 'RM']]
y = df['PRICE']

# Remove outliers with LOF; fit_predict labels inliers 1 and outliers -1.
lof = LOF(n_neighbors=20, contamination='auto')
y_pred = lof.fit_predict(np.c_[x, y])
# Boolean-mask filtering keeps dtypes and avoids rebuilding from .values.
df = df[y_pred == 1].reset_index(drop=True)
print(df.shape)

# Cube-root transform of LSTAT linearizes its relation to PRICE,
# lifting the test score from ~0.81 to ~0.85.
data = np.c_[df['LSTAT'] ** (1 / 3), df['RM']]
x = pd.DataFrame(data=data, columns=['LSTAT', 'RM'])
y = df['PRICE']

# Split the data: 80% train / 20% test (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=5)

model = LinearRegression()   # linear regression model
model.fit(x_train, y_train)  # training; may take a while on large data
# R^2 score on the held-out test set; closer to 1 means more accurate.
print(f'分數 : {model.score(x_test, y_test)}')

# Predict prices for the test set and print (predicted, actual) pairs.
y_pred = model.predict(x_test)
for i in zip(y_pred, y_test):
    print(i)
結果 :
(455, 3)
分數 : 0.8500263734728444
(17.684860535122077, 17.4)
(27.556142791275335, 25.3)
(30.90162563908891, 36.1)
(29.183795216204057, 32.2)
(25.254389371333477, 22.0)