# Load the airline feature tables (train and test splits) from the Datamanim repo.
import pandas as pd

# Both files live in the same directory, so read them in one pass: train first, then test.
train, test = (
    pd.read_csv(f'https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/{split}.csv')
    for split in ('x_train', 'x_test')
)
# Preview the first five training rows in the notebook.
display(train.head(n=5))
[output]
1. train 데이터의 Flight Distance 컬럼을 사이킷런 모듈을 이용하여 최솟값을 0, 최댓값을 1로 하는 데이터로 변환하고 scaling을 이름으로 하는 컬럼으로 데이터프레임에 추가하라.
# Load the drug dataset: `x` holds the feature table, `y` the target.
import pandas as pd

x = pd.read_csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/drug/x_train.csv'
)
# Keep only the 'Drug' column, but as a one-column DataFrame rather than a Series.
y = pd.read_csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/drug/y_train.csv'
).loc[:, ['Drug']]
# Preview the first five rows of the features, then of the target.
display(x.head(n=5), y.head(n=5))
[output]
6. x, y 데이터에서 train, test 세트를 구분하고 train 셋의 y값과 test 셋의 y값의 unique한 value별 개수를 출력하라. train:test는 7:3 비율, random_state=42로 고정
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Per-class row counts of each split, side by side.
# The columns are labelled through `keys=` instead of renaming 0/1 afterwards:
# since pandas 2.0 `DataFrame.value_counts()` returns a Series named 'count',
# so the concatenated columns are no longer 0 and 1 and a
# `rename(columns={0: 'train', 1: 'test'})` silently does nothing, leaving two
# columns that are both called 'count'. `keys=` works on old and new pandas.
t = pd.concat(
    [y_train.value_counts(), y_test.value_counts()],
    axis=1,
    keys=['train', 'test'],
).reset_index()
display(t)
[output]
7. x, y 데이터에서 train, test 세트를 구분하되 train 셋의 y값과 test 셋의 y값의 unique한 value 비율이 동일하도록 추출하라. train:test는 7:3 비율, random_state=42로 고정
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set with a fixed seed; `stratify=y`
# asks train_test_split to preserve the class proportions of `y` in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

# Per-class row counts of each split, side by side.
# The columns are labelled through `keys=` instead of renaming 0/1 afterwards:
# since pandas 2.0 `DataFrame.value_counts()` returns a Series named 'count',
# so the concatenated columns are no longer 0 and 1 and a
# `rename(columns={0: 'train', 1: 'test'})` silently does nothing, leaving two
# columns that are both called 'count'. `keys=` works on old and new pandas.
t = pd.concat(
    [y_train.value_counts(), y_test.value_counts()],
    axis=1,
    keys=['train', 'test'],
).reset_index()
display(t)
[output]
03 모델링
모델링의 기본 골격
model import
model 선언, 초기 하이퍼 파라미터 지정
model.fit(x_train,y_train)을 통한 모델 학습
회귀, 분류 문제 모두 model.predict(x_validation) 을 통한 예측
auc값을 구해야하는 경우 model.predict_proba(x_validation)을 통한 확률 추출 (svm모델의 경우 학습시 probability=True옵션 추가)
원하는 metric으로 모델 평가 eg) accuracy_score(y_validation, model.predict(x_validation))
sklearn 학습 모듈 모음
분류문제의 경우 -Classifier , 회귀문제의 경우 -Regressor 형식
#ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
#linear_model
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import GammaRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression # 분류
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier # 분류
#neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsTransformer
from sklearn.neighbors import NearestNeighbors
#svm
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
from sklearn.svm import OneClassSVM
from sklearn.svm import SVR # regression
from sklearn.svm import SVC # classfier
#tree
from sklearn.tree import BaseDecisionTree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import ExtraTreeRegressor