# AI lib - sklearn
# xxxxClassifier (classification) - evaluation metrics: accuracy, precision, recall, F1 (harmonic mean)
# xxxxRegression / xxxxRegressor (prediction / regression)

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt


import sklearn
from sklearn.datasets import load_iris, load_breast_cancer, fetch_openml

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error

from sklearn.neighbors import KNeighborsClassifier , KNeighborsRegressor
from sklearn.linear_model import LogisticRegression , LinearRegression, Ridge, Lasso

#비지도학습
from sklearn.cluster import KMeans


from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, Binarizer, PolynomialFeatures


# Report the installed scikit-learn version so the recorded results can be reproduced.
print(f'sklearn version {sklearn.__version__}')

sklearn version 1.2.2


# Perch lengths; element i belongs to the same fish as perch_weight[i] (56 fish).
perch_length = np.array([
    8.4, 13.7, 15.0, 16.2, 17.4, 18.0, 18.7, 19.0,
    19.6, 20.0, 21.0, 21.0, 21.0, 21.3, 22.0, 22.0,
    22.0, 22.0, 22.0, 22.5, 22.5, 22.7, 23.0, 23.5,
    24.0, 24.0, 24.6, 25.0, 25.6, 26.5, 27.3, 27.5,
    27.5, 27.5, 28.0, 28.7, 30.0, 32.8, 34.5, 35.0,
    36.5, 36.0, 37.0, 37.0, 39.0, 39.0, 39.0, 40.0,
    40.0, 40.0, 40.0, 42.0, 43.0, 43.0, 43.5, 44.0,
])

# Perch weights, in the same fish order as perch_length (the regression target).
perch_weight = np.array([
    5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0,
    85.0, 85.0, 110.0, 115.0, 125.0, 130.0, 120.0, 120.0,
    130.0, 135.0, 110.0, 130.0, 150.0, 145.0, 150.0, 170.0,
    225.0, 145.0, 188.0, 180.0, 197.0, 218.0, 300.0, 260.0,
    265.0, 250.0, 250.0, 300.0, 320.0, 514.0, 556.0, 840.0,
    685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 820.0, 850.0,
    900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 1000.0, 1000.0,
])


# Eyeball the raw data before modelling: weight plotted against length.
for caption in (
    '데이터가 어떤 형태를 띄고 있는지 시각화 - ',
    '결론 : 농어의 길이가 클수록 무게는 증가하는 경향이 있다.',
):
    print(caption)

plt.scatter(x=perch_length, y=perch_weight)
plt.show()
plt.close()

데이터가 어떤 형태를 띄고 있는지 시각화 - 
결론 : 농어의 길이가 클수록 무게는 증가하는 경향이 있다.


print('우리는 무게를 예측, 그런데 무게에 영향을 미치는 요소가 길이가 되는 것')
print('학습에 필요한 데이터세트는 2차원 - 즉, 차원변경이 필요하다 ')

# Hold out 20% of the fish for testing; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, shuffle=True, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(
    perch_length, perch_weight, **split_options)

# At this point the features are still 1-D arrays.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

우리는 무게를 예측, 그런데 무게에 영향을 미치는 요소가 길이가 되는 것
학습에 필요한 데이터세트는 2차원 - 즉, 차원변경이 필요하다 
(44,) (12,) (44,) (12,)


# Estimators expect a 2-D feature matrix (samples x features), so give the
# single length feature its own column axis. Targets stay 1-D.
X_train = X_train[:, np.newaxis]
X_test = X_test[:, np.newaxis]

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(44, 1) (12, 1) (44,) (12,)


print('과대적합 - 학습데이터에 충실한 결과 테스트데이터의 정확도가 낮은 경우 ')
print('과소적합 - 테스트데이터의 정확도가 학습데이터의 정확도보다 높은 경우 ')

# k-nearest-neighbours regressor with default hyper-parameters.
knn_regression_model = KNeighborsRegressor().fit(X_train, y_train)

# Compare the test score with the train score to judge over-/under-fitting.
test_score = knn_regression_model.score(X_test, y_test)
train_score = knn_regression_model.score(X_train, y_train)
print(test_score)
print(train_score)

과대적합 - 학습데이터에 충실한 결과 테스트데이터의 정확도가 낮은 경우 
과소적합 - 테스트데이터의 정확도가 학습데이터의 정확도보다 높은 경우 
0.9761124130689229
0.9749133450418946


print('그래서, 회귀의 경우 결정계수(R^2) 평가를 한다. ')
y_predict = knn_regression_model.predict(X_test)

# Spot-check the first held-out sample: prediction vs. true target vs. input.
for label, value in (
    ('predict - ', y_predict[0]),
    ('target - ', y_test[0]),
    ('data - ', X_test[0]),
):
    print(label, value)

# Error metrics on the test split; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
rmse = np.sqrt(mse)
print('mse - ', mse)
print('mae -', mae)
print('rmse - ', rmse)

그래서, 회귀의 경우 결정계수(R^2) 평가를 한다. 
predict -  258.6
target -  250.0
data -  [27.5]
mse -  3576.4708333333333
mae - 34.225
rmse -  59.80360217690347


# Plain-language notes on the three error metrics (the empty entry prints a blank line).
metric_notes = (
    'mse - 모델이 예측한 값과 실제값 차이의 면적을 제곱 - 작으면 작을수록 성능이 좋다',
    'mae - 모델이 예측한 값과 실제값 차이의 절대값의 평균(가장 직관적인 지표가 된다.) ',
    '',
    'rmse - mse에 루트를 씌운 것 (오류지표가 실제값과 유사한 단위로 다시 변환이 되는 것)',
)
for note in metric_notes:
    print(note)

mse - 모델이 예측한 값과 실제값 차이의 면적을 제곱 - 작으면 작을수록 성능이 좋다
mae - 모델이 예측한 값과 실제값 차이의 절대값의 평균(가장 직관적인 지표가 된다.) 

rmse - mse에 루트를 씌운 것 (오류지표가 실제값과 유사한 단위로 다시 변환이 되는 것)


print('unseen data - 사용자가 입력하는 데이터로 모델이 학습 또는 테스트 시 보지못한 데이터를 의미한다.')

# Ask the KNN model about a length of 50, which is larger than any length in the data.
unseen_prediction = knn_regression_model.predict([[50]])
print('unseen data -', unseen_prediction)

unseen data - 사용자가 입력하는 데이터로 모델이 학습 또는 테스트 시 보지못한 데이터를 의미한다.
unseen data - [974.]


print('회귀란? - 기울기와 절편의 값을 학습을 통해서 조절해나가는 방식으로 오차를 줄이는 것')
print('y= (w*x) + bias')

# Ordinary least-squares straight line: weight as a function of length.
linear_model = LinearRegression().fit(X_train, y_train)
y_predict = linear_model.predict(X_test)

# Spot-check the first held-out sample.
for label, value in (
    ('data - ', X_test[0]),
    ('answer -', y_test[0]),
    ('guess -', y_predict[0]),
):
    print(label, value)

회귀란? - 기울기와 절편의 값을 학습을 통해서 조절해나가는 방식으로 오차를 줄이는 것
y= (w*x) + bias
data -  [27.5]
answer - 250.0
guess - 357.1745778121581


# Show the learned slope (w) and intercept (b) of the fitted line.
print('w = 기울기 - ', linear_model.coef_)
print('b = 절편 - ', linear_model.intercept_)

# Re-derive the prediction for the first test sample by hand (y = w*x + b).
# The values are read from the fitted model instead of being pasted in from a
# previous run, so the check stays correct if the data or the split changes.
print((linear_model.coef_[0] * X_test[0, 0]) + linear_model.intercept_)

w = 기울기 -  [38.0863622]
b = 절편 -  -690.200382743657
357.17457775634307


# R^2 of the straight-line model: test split first, then train split.
for features, targets in ((X_test, y_test), (X_train, y_train)):
    print('R^2 결정계수 - ', linear_model.score(features, targets))

R^2 결정계수 -  0.8824139293666515
R^2 결정계수 -  0.9298154590975974


print('모델 시각화 -')

# Training points with the fitted line drawn between lengths 5 and 50.
plt.scatter(X_train, y_train)
line_x = [5, 50]
line_y = [(x_end * linear_model.coef_) + linear_model.intercept_ for x_end in line_x]
plt.plot(line_x, line_y)

plt.show()
plt.close()

모델 시각화 -


# Same fitted line as before, this time drawn over the held-out test points.
plt.scatter(X_test, y_test)
line_x = [5, 50]
line_y = [(x_end * linear_model.coef_) + linear_model.intercept_ for x_end in line_x]
plt.plot(line_x, line_y)

plt.show()
plt.close()


print('최적의 직선보다 최적의 곡선을 찾으면 어떨까? - ')
print('다항회귀 poly - 2차방정식 - 항을 추가하는 것(길이를 제곱하는) ')
print('y = (w1 * x1) + (w2 * x2) + bias')

# Build a two-column design matrix [length^2, length] so a linear model can
# fit a quadratic curve.
X_train_poly = np.column_stack((X_train ** 2, X_train))
X_test_poly = np.column_stack((X_test ** 2, X_test))

# Report the shapes of the matrices just built (the original printed the
# unchanged single-column X_train / X_test, which hid the added feature).
print(X_train_poly.shape, X_test_poly.shape, y_train.shape, y_test.shape)

최적의 직선보다 최적의 곡선을 찾으면 어떨까? - 
다항회귀 poly - 2차방정식 - 항을 추가하는 것(길이를 제곱하는) 
y = (w1 * x1) + (w2 * x2) + bias
(44, 1) (12, 1) (44,) (12,)


# Linear regression on [length^2, length] gives a quadratic fit in length.
poly_model = LinearRegression().fit(X_train_poly, y_train)
y_predict = poly_model.predict(X_test_poly)

# coef_ holds one weight per column, in the order [length^2, length].
for label, value in (
    ('w = 기울기 - ', poly_model.coef_),
    ('b = 절편 - ', poly_model.intercept_),
):
    print(label, value)

w = 기울기 -  [  1.10433336 -26.95714713]
b = 절편 -  188.83852627979167


# Spot-check the first held-out sample. The model consumed the two-column
# polynomial row, so show that row (the original printed X_test[0], which is
# not what was fed to poly_model).
print('data - ', X_test_poly[0])
print('guess -', y_predict[0])
# NOTE: despite the 'model' label, this line prints the true target value.
print('model - ', y_test[0])

# Extrapolate to a length of 50; features must follow the [length^2, length] layout.
poly_model.predict([[50**2, 50]])

data -  [756.25  27.5 ]
guess - 282.669084343809
model -  250.0

array([1601.81457208])


print('모델 시각화 -')

plt.scatter(X_train, y_train)

# Evaluate the fitted quadratic on integer lengths 5..49 and draw it.
point = np.arange(5, 50)
w_squared, w_linear = poly_model.coef_
curve = ((point ** 2) * w_squared) + (point * w_linear) + poly_model.intercept_
plt.plot(point, curve)

plt.show()
plt.close()

모델 시각화 -


# R^2 of the quadratic model: test split first, then train split.
for features, targets in ((X_test_poly, y_test), (X_train_poly, y_train)):
    print('R^2 결정계수 - ', poly_model.score(features, targets))

R^2 결정계수 -  0.9724712713069817
R^2 결정계수 -  0.9718898579448155


print('다중회귀 - 독립변수가 여러개인 데이터 세트를 학습시키는 모델 - ')

# Download the multi-feature perch table and keep it as a plain NumPy array.
perchFrm = pd.read_csv('https://bit.ly/perch_csv_data').to_numpy()
print('type - ', type(perchFrm))

다중회귀 - 독립변수가 여러개인 데이터 세트를 학습시키는 모델 - 
type -  <class 'numpy.ndarray'>


# Number of independent variables (features): 3
#perchFrm


# Re-split using the multi-feature matrix; same 80/20 proportions and seed as
# the single-feature split.
split_options = dict(test_size=0.2, shuffle=True, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(
    perchFrm, perch_weight, **split_options)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(44, 3) (12, 3) (44,) (12,)


# Expand the features with PolynomialFeatures (default settings). The
# transformer is fitted on the training split and reused for the test split.
poly = PolynomialFeatures().fit(X_train)
X_train_poly, X_test_poly = (poly.transform(part) for part in (X_train, X_test))

print(*(part.shape for part in (X_train_poly, X_test_poly, y_train, y_test)))

(44, 10) (12, 10) (44,) (12,)


print('추가된 독립변수의 특성을 확인 - ')

# Names of the generated columns, in the same order as the transformed matrix.
generated_names = poly.get_feature_names_out()
print(generated_names)

추가된 독립변수의 특성을 확인 - 
['1' 'x0' 'x1' 'x2' 'x0^2' 'x0 x1' 'x0 x2' 'x1^2' 'x1 x2' 'x2^2']


# Plain linear regression on the expanded polynomial feature matrix.
poly_linear_model = LinearRegression().fit(X_train_poly, y_train)

for label, features, targets in (
    ('train 결정계수 - ', X_train_poly, y_train),
    ('test 결정계수 - ', X_test_poly, y_test),
):
    print(label, poly_linear_model.score(features, targets))

train 결정계수 -  0.9890186278665611
test 결정계수 -  0.9848123779621021


for line in (
    '규제를 통한 회귀 -',
    '규제한다라는 건 어떤 의미?',
    '선행조건) 규제전에 정규화',
):
    print(line)

# Show one unscaled row to see how the polynomial columns differ in magnitude.
first_row = X_train_poly[0]
print(first_row)

규제를 통한 회귀 -
규제한다라는 건 어떤 의미?
선행조건) 규제전에 정규화
[  1.      28.7      7.59     4.64   823.69   217.833  133.168   57.6081
  35.2176  21.5296]


# Standardise the polynomial features. The scaler is fitted on the training
# split only and then applied to both splits.
scaler = StandardScaler().fit(X_train_poly)
X_train_scaler, X_test_scaler = (
    scaler.transform(part) for part in (X_train_poly, X_test_poly)
)


# First standardised training row (displayed when run as a notebook cell).
X_train_scaler[0]

array([ 0.        ,  0.0295517 , -0.16078353, -0.11150082, -0.11277996,
       -0.21862473, -0.19570287, -0.3076797 , -0.28599055, -0.26627583])


print('릿지 규제 회귀 - 값을 제곱')
print('라쏘 규제 회귀 - 값을 절대값을 기준으로')

# Lasso with default settings on the standardised features; the model name is
# reused from the unregularised fit.
poly_linear_model = Lasso().fit(X_train_scaler, y_train)
print()
for label, features, targets in (
    ('train 결정계수 - ', X_train_scaler, y_train),
    ('test 결정계수 - ', X_test_scaler, y_test),
):
    print(label, poly_linear_model.score(features, targets))

릿지 규제 회귀 - 값을 제곱
라쏘 규제 회귀 - 값을 절대값을 기준으로

train 결정계수 -  0.9845858527610626
test 결정계수 -  0.9906020044140663


# Small 2-D toy data set for the k-means demo: twelve (x, y) points.
# The frame is built in a single call instead of growing an empty frame one
# .loc row at a time, which copies the data on every insert and can leave the
# columns with object dtype. Here both columns are integer-typed from the start.
tmpFrm = pd.DataFrame(
    [
        (7, 1),
        (2, 1),
        (4, 2),
        (9, 4),
        (10, 5),
        (10, 6),
        (11, 5),
        (11, 6),
        (15, 3),
        (16, 5),
        (16, 6),
        (16, 1),
    ],
    columns=['x', 'y'],
)
tmpFrm


# Wide figure showing the toy points before clustering.
plt.figure(figsize=(15, 5))
xs, ys = tmpFrm['x'], tmpFrm['y']
plt.scatter(xs, ys)
plt.show()
plt.close()


# Cluster the toy points into three groups.
# n_init is set explicitly: relying on the default triggers a FutureWarning
# because the default changes from 10 to 'auto' in scikit-learn 1.4.
cluster_model = KMeans(n_clusters=3, n_init=10, random_state=100)

# Fit on the two coordinate columns only, as floats. Selecting them by name
# keeps the fit correct even if extra columns (e.g. cluster labels) are later
# attached to tmpFrm and this cell is re-run.
cluster_model.fit(tmpFrm[['x', 'y']].to_numpy(dtype=float))

/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

KMeans(n_clusters=3, random_state=100)

KMeans(n_clusters=3, random_state=100)


# cluster_model.labels_
# Display the centres of the fitted clusters (shown when run as a notebook cell).
# NOTE(review): the recorded output for this cell has four columns per centre,
# which does not match the two-column toy data fitted above; it looks like it
# was captured after a different fit. Re-run to confirm.
cluster_model.cluster_centers_

array([[5.9016129 , 2.7483871 , 4.39354839, 1.43387097],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [6.85      , 3.07368421, 5.74210526, 2.07105263]])


# Attach each point's cluster label so it can be used as the colour key.
# The original never created this column, so hue='cluster_id' failed.
tmpFrm['cluster_id'] = cluster_model.labels_

# Scatter of the toy points coloured by cluster (regression line disabled).
sns.lmplot(x='x', y='y', data=tmpFrm, hue='cluster_id', fit_reg=False)

<seaborn.axisgrid.FacetGrid at 0x7d7966dcbf40>


from sklearn.datasets import make_blobs


# Generate 300 synthetic points with make_blobs (x: coordinates, y: blob labels).
x, y = make_blobs(n_samples=300)


# Plot the first two coordinate columns of the generated points.
first_axis, second_axis = x[:, 0], x[:, 1]
plt.scatter(first_axis, second_axis, marker='o')
plt.show()
plt.close()


# Generate blob data and cluster it. These three statements were commented
# out in the original, which left `feature` undefined and `cluster_model`
# fitted on a different, smaller data set, so the plot below could not run.
feature, label = make_blobs(n_samples=300)
# n_init is explicit to avoid the FutureWarning about its changing default.
cluster_model = KMeans(n_clusters=3, n_init=10)
cluster_model.fit(feature)


# Points coloured by assigned cluster, with the three centres drawn as triangles.
plt.scatter(feature[:, 0], feature[:, 1], marker='o', c=cluster_model.labels_)
plt.scatter(cluster_model.cluster_centers_[:, 0],
            cluster_model.cluster_centers_[:, 1], marker='^', c=['r', 'g', 'b'])

plt.show()
plt.close()


# Load the iris data set and wrap its measurements in a DataFrame whose
# columns are named after the features.
iris = load_iris()


irisFrm = pd.DataFrame(iris.data, columns=iris.feature_names)
irisFrm


# Cluster the iris measurements into three groups.
# - n_init is explicit because its default changes in scikit-learn 1.4
#   (the original triggered a FutureWarning).
# - random_state makes the cluster numbering repeatable; without it the ids
#   assigned to each group can differ from run to run.
cluster_model = KMeans(n_clusters=3, max_iter=300, n_init=10, random_state=100)
cluster_model.fit(irisFrm.values)

/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

KMeans(n_clusters=3)

KMeans(n_clusters=3)


# Per-sample cluster assignments followed by the cluster centres.
assigned_clusters = cluster_model.labels_
cluster_centres = cluster_model.cluster_centers_

print('labels  - ', assigned_clusters)
print('center  - ')
print(cluster_centres)

labels  -  [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0
 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0
 0 2]
center  - 
[[6.85       3.07368421 5.74210526 2.07105263]
 [5.006      3.428      1.462      0.246     ]
 [5.9016129  2.7483871  4.39354839 1.43387097]]


# Put the predicted cluster and the true species label side by side.
irisFrm = irisFrm.assign(cluster_id=cluster_model.labels_, target=iris.target)
irisFrm


# Count samples per (true species, assigned cluster) pair to see how well the
# clusters line up with the species.
by_species_and_cluster = irisFrm.groupby(['target', 'cluster_id'])
by_species_and_cluster['sepal length (cm)'].count()

target  cluster_id
0       1             50
1       0              2
        2             48
2       0             36
        2             14
Name: sepal length (cm), dtype: int64

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2
...	...	...	...	...
145	6.7	3.0	5.2	2.3
146	6.3	2.5	5.0	1.9
147	6.5	3.0	5.2	2.0
148	6.2	3.4	5.4	2.3
149	5.9	3.0	5.1	1.8

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	cluster_id	target
0	5.1	3.5	1.4	0.2	1	0
1	4.9	3.0	1.4	0.2	1	0
2	4.7	3.2	1.3	0.2	1	0
3	4.6	3.1	1.5	0.2	1	0
4	5.0	3.6	1.4	0.2	1	0
...	...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	0	2
146	6.3	2.5	5.0	1.9	2	2
147	6.5	3.0	5.2	2.0	0	2
148	6.2	3.4	5.4	2.3	0	2
149	5.9	3.0	5.1	1.8	2	2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2
...	...	...	...	...
145	6.7	3.0	5.2	2.3
146	6.3	2.5	5.0	1.9
147	6.5	3.0	5.2	2.0
148	6.2	3.4	5.4	2.3
149	5.9	3.0	5.1	1.8

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	cluster_id	target
0	5.1	3.5	1.4	0.2	1	0
1	4.9	3.0	1.4	0.2	1	0
2	4.7	3.2	1.3	0.2	1	0
3	4.6	3.1	1.5	0.2	1	0
4	5.0	3.6	1.4	0.2	1	0
...	...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	0	2
146	6.3	2.5	5.0	1.9	2	2
147	6.5	3.0	5.2	2.0	0	2
148	6.2	3.4	5.4	2.3	0	2
149	5.9	3.0	5.1	1.8	2	2

코랩을 이용한 머신러닝 기초 day03

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2
...	...	...	...	...
145	6.7	3.0	5.2	2.3
146	6.3	2.5	5.0	1.9
147	6.5	3.0	5.2	2.0
148	6.2	3.4	5.4	2.3
149	5.9	3.0	5.1	1.8

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	cluster_id	target
0	5.1	3.5	1.4	0.2	1	0
1	4.9	3.0	1.4	0.2	1	0
2	4.7	3.2	1.3	0.2	1	0
3	4.6	3.1	1.5	0.2	1	0
4	5.0	3.6	1.4	0.2	1	0
...	...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	0	2
146	6.3	2.5	5.0	1.9	2	2
147	6.5	3.0	5.2	2.0	0	2
148	6.2	3.4	5.4	2.3	0	2
149	5.9	3.0	5.1	1.8	2	2