import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
file_path = '서울시_기간별_시간평균_대기환경_정보_2020.04.csv'
df = pd.read_csv(file_path, encoding="cp949")
df['측정일시'] = pd.to_datetime(df['측정일시'], format='%Y%m%d%H%M')
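# Optional sanity check (an addition, not in the original post): confirm the
# timestamp parse produced a proper datetime column and a sensible range.
print(df['측정일시'].dtype, df['측정일시'].min(), df['측정일시'].max())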

# Save the Seocho-gu (서초구) rows as the validation dataframe
df.loc[df['측정소명'] == '서초구'].sort_values('측정일시')\
  .to_csv('valid_df.csv', encoding='utf-8-sig', index=False)
# Prepare the training data: every station except Seocho-gu
filtered_df = df.loc[df['측정소명'] != '서초구']
# For each station, sort by timestamp and stack the rows vertically
bowl_df = pd.DataFrame()

for region in filtered_df['측정소명'].unique():
  bowl_df = pd.concat([bowl_df, filtered_df.loc[filtered_df['측정소명'] == region].sort_values('측정일시')], axis=0)

final_df = bowl_df.reset_index(drop=True)

from sklearn.preprocessing import LabelEncoder

# Label-encode every object (string) column, e.g. 권역명 and 측정소명
for col in final_df.select_dtypes(include=object).columns:
  final_df[col] = LabelEncoder().fit_transform(final_df[col])
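# Note (an addition, not in the original post): fitting a fresh LabelEncoder
# per column discards the mapping. If you later need to recover station names
# from the encoded values, keep the fitted encoders, e.g.:
# encoders = {}
# for col in final_df.select_dtypes(include=object).columns:
#     encoders[col] = LabelEncoder().fit(final_df[col])
#     final_df[col] = encoders[col].transform(final_df[col])
# encoders['측정소명'].inverse_transform([0, 1])  # back to station names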

X = final_df[['권역명', '측정소명','미세먼지 24시간(㎍/㎥)', '초미세먼지(㎍/㎥)', '오존(ppm)',
              '이산화질소농도(ppm)', '일산화탄소농도(ppm)', '아황산가스농도(ppm)']]
y = final_df[['측정소명', '미세먼지 1시간(㎍/㎥)']]
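# '측정소명' is kept in both X and y so the windowing loop below can group
# rows per station; it also remains in X as an (encoded) input feature.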

def seq_data(x, y, sequence_length, x_seq, y_seq):
  # Slide a window of `sequence_length` steps over x; the target is the
  # y value immediately after each window.
  for i in range(len(x) - sequence_length):
    x_seq.append(x[i: i + sequence_length])
    y_seq.append(y[i + sequence_length])
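# A quick illustration (an addition, not in the original post): with 8 rows
# and a window of 5, seq_data yields 3 (window, target) pairs.
_xs, _ys = [], []
seq_data(np.arange(8).reshape(8, 1), np.arange(8), 5, _xs, _ys)
print(len(_xs), _xs[0].shape, _ys[0])  # 3 (5, 1) 5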
# Number of windowed samples used for training; the rest becomes the test set
split = 200

x_seq = []
y_seq = []

# Window length (number of RNN time steps): use the previous 5 hours to
# predict the 6th hour
sequence_length = 5

# Build windows per station so a sequence never crosses station boundaries
for region in X['측정소명'].unique():
  seq_data(X.loc[X['측정소명'] == region].values,
           y.loc[y['측정소명'] == region, '미세먼지 1시간(㎍/㎥)'].values,
           sequence_length, x_seq, y_seq)

x_seq = torch.FloatTensor(np.array(x_seq)).to(device)
y_seq = torch.FloatTensor(np.array(y_seq)).view([-1, 1]).to(device)

x_train_seq = x_seq[:split]
y_train_seq = y_seq[:split]
x_test_seq = x_seq[split:]
y_test_seq = y_seq[split:]
print(x_train_seq.size(), y_train_seq.size())
print(x_test_seq.size(), y_test_seq.size())
train = TensorDataset(x_train_seq, y_train_seq)
test = TensorDataset(x_test_seq, y_test_seq)

batch_size = 16
train_loader = DataLoader(dataset=train, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test, batch_size=batch_size, shuffle=False)
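# Optional check (an addition, not in the original post): inspect one batch.
seq_batch, target_batch = next(iter(train_loader))
print(seq_batch.shape, target_batch.shape)  # torch.Size([16, 5, 8]) torch.Size([16, 1])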
# Number of input features
input_size = x_seq.size(2)

# Two stacked RNN layers
num_layers = 2

# Hidden state size: 4 units
hidden_size = 4

class VanillaRNN(nn.Module):
  def __init__(self, input_size, hidden_size, sequence_length, num_layers, device):
    super(VanillaRNN, self).__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.device = device
    self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
    # Flatten the outputs of all time steps and map them to one prediction
    self.fc = nn.Sequential(nn.Linear(hidden_size * sequence_length, 1))

  def forward(self, x):
    # Initial hidden state: (num_layers, batch, hidden_size)
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(self.device)
    out, _ = self.rnn(x, h0)             # out: (batch, seq_len, hidden_size)
    out = out.reshape(out.shape[0], -1)  # (batch, seq_len * hidden_size)
    out = self.fc(out)
    return out
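# Smoke test (an addition, not in the original post): a dummy batch of 2
# windows should yield a (2, 1) output.
_m = VanillaRNN(input_size, hidden_size, sequence_length, num_layers, device).to(device)
print(_m(torch.zeros(2, sequence_length, input_size).to(device)).shape)  # torch.Size([2, 1])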

model = VanillaRNN(input_size=input_size,
                   hidden_size=hidden_size,
                   sequence_length=sequence_length,
                   num_layers=num_layers,
                   device=device).to(device)
criterion = nn.MSELoss()

lr = 1e-3
num_epochs = 200
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_graph = []  # per-epoch average loss, kept for plotting
n = len(train_loader)  # number of batches per epoch

for epoch in range(num_epochs):
  running_loss = 0.0

  for data in train_loader:

    seq, target = data  # one mini-batch
    # seq shape: (batch_size, sequence_length, input_size) = (16, 5, 8)

    out = model(seq)               # forward pass
    loss = criterion(out, target)  # MSE between prediction and target

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    running_loss += loss.item()    # accumulate this batch's loss

  loss_graph.append(running_loss / n)  # average batch loss for this epoch
  if epoch % 10 == 0:
    print('[epoch: %d] loss: %.4f' % (epoch, running_loss / n))
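# Plot the training curve with the matplotlib import above (a sketch; an
# assumption about how the post visualizes loss_graph):
plt.figure(figsize=(10, 4))
plt.plot(loss_graph)
plt.xlabel('epoch')
plt.ylabel('average MSE loss')
plt.show()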

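# Evaluation sketch (an addition, not in the original post): test_loader is
# built above but never used, so a minimal test-set MSE could look like this.
model.eval()
with torch.no_grad():
  test_loss = sum(criterion(model(seq), target).item() for seq, target in test_loader)
print('test loss: %.4f' % (test_loss / len(test_loader)))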