1차 report(원데이터 사용)

데이터분석

In [1]:

import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from tensorflow import keras
from scipy.interpolate import interp1d
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras import backend as K
import pickle
from sklearn.model_selection import train_test_split
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
# 한글 폰트 설정
font_path = "C:/Windows/Fonts/malgun.ttf"  # 한글 폰트 경로
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)

데이터 load, 시트간 merge

In [2]:

def load():
        # new_data 들어오면 기존 df 에 합치면 됨
        data = pd.read_excel("../data/TSB_data.xlsx", sheet_name='야드크레이인_작업이력')
        scd_data = pd.read_excel("../data/TSB_data.xlsx", sheet_name='반출입_예정컨테이너')
        cbd_data = pd.read_excel("../data/TSB_data.xlsx", sheet_name='장치장_전')
        cad_data = pd.read_excel("../data/TSB_data.xlsx", sheet_name='장치장_후')
        quay_work_data = pd.read_excel("../data/TSB_data.xlsx", sheet_name='본선크레인_작업이력')

        # data, container_before_data, container_after_data merge
        ycb_common_values = data['컨테이너번호'].isin(cad_data['컨테이너번호']).sum() # 6103개
        yard_con_common_df = pd.merge(data, cad_data, on='컨테이너번호')
        return yard_con_common_df
yard_con_common_df = load()

데이터 전처리(날짜 - datetime, 문자열 - 숫자)

In [3]:

def preprocessing(common_df):
        # 데이터 전처리
        common_df['작업코드'] = common_df['작업코드'].replace({'VU': 1, 'VL': 2, 'GR': 3, 'GD': 4, 'TM':5,'TS':6})
        common_df['장비번호'] = common_df['장비번호'].replace({'Y02': 1})
        common_df['풀(F)공(M)'] = common_df['풀(F)공(M)'].replace({'M':1, 'F':2})
        common_df['수출/수입'] = common_df['수출/수입'].replace({'X':1,'I':2,'S':3,'M':4})
        
        # 외부트럭에 1 넣기
        common_df['야드트럭(번호)'] = common_df['야드트럭(번호)'].fillna(1)
    
        # 시간 타입 통합
        common_df['작업생성시간'] = pd.to_datetime(common_df['작업생성시간'], format='%Y%m%d%H%M%S')
        common_df['작업완료시간'] = pd.to_datetime(common_df['작업완료시간'], format='%Y%m%d%H%M%S')
        #print('작업생성시간',common_df['작업생성시간'].dtype)
        #print('작업완료시간',common_df['작업완료시간'].dtype)
        common_df['작업+대기시간'] = common_df['작업완료시간'] -common_df['작업생성시간']
        # common_df = common_df[-200:]

        # 작업+대기차량 구하기
        common_df['작업+대기차량'] = 0
        # 행 인덱스를 역순으로 순회
        for i in range(len(common_df) - 1, -1, -1):
            # i번째 행의 작업생성시간
            creation_time = common_df.loc[i, '작업생성시간']
            # i번째 행보다 앞에 있는 행들 중에서 작업완료시간이 아직 지나지 않은 것들의 수를 계산
            count = 0
            for j in range(i - 1, -1, -1):
                if common_df.loc[j, '작업완료시간'] > creation_time:
                    count += 1
            
            # 계산한 수를 i번째 행의 '대기차량' 열에 저장
            common_df.loc[i, '작업+대기차량'] = count
        print(common_df[['작업+대기시간', '작업+대기차량']])

        # 작업+대기시간을 초로 변환
        common_df['작업+대기시간'] = common_df['작업+대기시간'].dt.total_seconds() /60.0
        common_df['풀(F)공(M)'] = common_df['풀(F)공(M)'].astype('int64')
        common_df = common_df.dropna(subset=['작업+대기시간'])
        common_df_complete = common_df
        common_df_complete.sort_values('작업생성시간', inplace=True)
        #print(common_df_complete[['작업생성시간','작업+대기시간']])


        common_df_complete = common_df_complete[['작업생성시간','작업+대기시간', '작업코드', '장비번호', '풀(F)공(M)', '수출/수입', '컨테이너(사이즈 코드)', '작업+대기차량']]
        common_df_complete['입차시간'] = common_df_complete['작업생성시간']
        #print(common_df_complete['작업+대기시간'])
        grouped_df = common_df_complete.groupby(pd.Grouper(key='작업생성시간', freq='5min')).mean()
        #print('grouped_df',grouped_df['작업+대기시간'])
        grouped_df['입차시간'] = grouped_df['입차시간'].astype('int64') // 10**9
        return grouped_df, common_df
grouped_df, common_df = preprocessing(yard_con_common_df)

             작업+대기시간  작업+대기차량
0    0 days 00:07:13        0
1    0 days 00:06:21        1
2    0 days 00:09:29        2
3    0 days 00:04:43        3
4    0 days 00:05:00        0
...              ...      ...
6098 0 days 00:20:39      139
6099 0 days 00:05:00       30
6100 0 days 00:05:00       26
6101 0 days 00:21:38      144
6102 0 days 00:05:00       26

[6103 rows x 2 columns]

C:\Users\SW\AppData\Local\Temp\ipykernel_7452\1394065981.py:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_df_complete.sort_values('작업생성시간', inplace=True)
C:\Users\SW\AppData\Local\Temp\ipykernel_7452\1394065981.py:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_df_complete['입차시간'] = common_df_complete['작업생성시간']

데이터 상관관계 분석

In [4]:

# data, quay merge
common_df_truck_in = common_df[common_df['야드트럭(번호)'] != 1]
print(common_df_truck_in.shape)
common_df_truck_out = common_df[common_df['야드트럭(번호)'] == 1]
print(common_df_truck_out.shape)
correlation_matrix = common_df_truck_out[['작업코드','항차_x','야드트럭(번호)','컨테이너(사이즈 코드)','장비번호', '작업생성시간','작업완료시간', '풀(F)공(M)', '수출/수입','작업+대기시간']].corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

(2000, 42)
(4101, 42)

d:\김형찬\P6_disabled\P6_EmpCont\venv\lib\site-packages\seaborn\utils.py:80: UserWarning: Glyph 8722 (\N{MINUS SIGN}) missing from current font.
  fig.canvas.draw()
d:\김형찬\P6_disabled\P6_EmpCont\venv\lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Glyph 8722 (\N{MINUS SIGN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

LSTM 예측 모델 생성

In [5]:

def make_model(grouped_df):
    lookback = 30

    # 데이터 전처리
    X_data = grouped_df[['입차시간','작업+대기시간', '작업코드', '풀(F)공(M)','수출/수입','장비번호', '컨테이너(사이즈 코드)', '작업+대기차량']]
    y_data = grouped_df['작업+대기시간'].values

    X, y = [], []
    for i in range(grouped_df.shape[0] - lookback):
        X.append(X_data[i:i+lookback].values)
        y.append(y_data[i+lookback])
    X = np.array(X)
    y = np.array(y)

    # 시계열이라 데이터를 랜덤하게 분할하지 않고 시간대에 따라 분할
    train_size = int(len(X) * 0.8)

    # 시간 순서를 유지하면서 데이터 분할
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    print(X_train.shape)
    # 데이터 정규화
    scaler = MinMaxScaler()
    X_train_scaled = np.zeros_like(X_train)
    X_test_scaled = np.zeros_like(X_test)

    for i in range(X_train.shape[2]):
        X_train_scaled[:, :, i] = scaler.fit_transform(X_train[:, :, i])
        X_test_scaled[:, :, i] = scaler.transform(X_test[:, :, i])

    # y_train, y_test 스케일링
    scaler_y = MinMaxScaler()
    y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
    y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1))

    # LSTM 모델 구성
    model = keras.Sequential()
    model.add(keras.layers.LSTM(units=64, input_shape=(lookback, X_train.shape[-1])))
    model.add(keras.layers.Dense(units=64, activation='relu'))
    model.add(keras.layers.Dense(units=32, activation='relu'))
    model.add(keras.layers.Dense(units=1))

    # 모델 컴파일
    model.compile(loss='mean_squared_error', optimizer='adam')

    # 모델 학습
    model.fit(X_train_scaled, y_train_scaled, epochs=100, batch_size=32)

    # 모델 예측
    y_train_pred_scaled = model.predict(X_train_scaled)
    y_test_pred_scaled = model.predict(X_test_scaled)

    # 예측값을 원래의 스케일로 되돌리기
    y_train_pred = scaler_y.inverse_transform(y_train_pred_scaled)
    y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled)

    # 원래의 스케일로 되돌린 실제값
    y_train_real = scaler_y.inverse_transform(y_train_scaled)
    y_test_real = scaler_y.inverse_transform(y_test_scaled)

    time_index = 0
    # '입차시간' 데이터 추출
    X_train_time = X_train[:, -1:, time_index].reshape(-1)
    X_test_time = X_test[:, -1:, time_index].reshape(-1)
    # '입차시간' 데이터 원래대로 되돌리기
    X_train_time_original = pd.to_datetime(X_train_time * 10**9)
    X_test_time_original = pd.to_datetime(X_test_time * 10**9)

    combined_time = np.concatenate((X_test_time_original, X_train_time_original), axis=0)
    combined_pred = np.concatenate((y_test_pred, y_train_pred), axis=0)
    combined_real = np.concatenate((y_test_real, y_train_real), axis=0)
    # datetime 형식을 리스트로 바꾸면 유닉스타임 스탬프로 변경돼서 다른 방법 써야 함
    # time = combined_time.tolist()
    datetime_list = pd.to_datetime(combined_time.tolist(), unit='ns')
    actual_values = combined_real.tolist()
    predict_values = combined_pred.tolist()
    
    return datetime_list, actual_values, predict_values, y_test_pred, y_train_pred, y_test_real, y_train_real
datetime_list, actual_values, predict_values, y_test_pred, y_train_pred, y_test_real, y_train_real = make_model(grouped_df)

예측값, 실제값 그래프 생성

In [6]:

def make_graph(datetime_list, actual_values, predict_values):
# 그래프의 크기 설정
    plt.figure(figsize=(14, 7))
    plt.scatter(datetime_list, actual_values, color='blue', label='Actual values')
    plt.scatter(datetime_list, predict_values, color='red', label='Predicted values')
    plt.xlabel('Time')
    plt.ylabel('Values')
    plt.title('Scatter plot of actual and predicted values over time')
    plt.legend()
    graph_image_filename = "lstm_graph.png"
    plt.savefig(graph_image_filename)
    print(f"그래프를 '{graph_image_filename}' 파일로 저장했습니다.")
    plt.show()
make_graph(datetime_list, actual_values, predict_values)

그래프를 'lstm_graph.png' 파일로 저장했습니다.

예측 결과 평가

In [7]:

def valid(y_test_pred, y_train_pred, y_test_real, y_train_real):
# Mean Absolute Error (MAE)
    mae_train = mean_absolute_error(y_train_real, y_train_pred)
    mae_test = mean_absolute_error(y_test_real, y_test_pred)

    print(f'Train MAE: {mae_train}, Test MAE: {mae_test}')

    # Mean Squared Error (MSE)
    mse_train = mean_squared_error(y_train_real, y_train_pred)
    mse_test = mean_squared_error(y_test_real, y_test_pred)

    print(f'Train MSE: {mse_train}, Test MSE: {mse_test}')

    # Root Mean Squared Error (RMSE)
    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)

    print(f'Train RMSE: {rmse_train}, Test RMSE: {rmse_test}')

    # R^2 Score
    r2_train = r2_score(y_train_real, y_train_pred)
    r2_test = r2_score(y_test_real, y_test_pred)

    print(f'Train R^2: {r2_train}, Test R^2: {r2_test}')
valid(y_test_pred, y_train_pred, y_test_real, y_train_real)

Train MAE: 1.0333753771606173, Test MAE: 0.6686029493159418
Train MSE: 1.8150934769599587, Test MSE: 0.6387612116257888
Train RMSE: 1.3472540506377997, Test RMSE: 0.7992253822457022
Train R^2: 0.43072415920788165, Test R^2: 0.23512287388724862

IT Together

1차 report(원데이터 사용)

데이터분석

티스토리툴바