library(tidyverse)
= c("tidyverse", "tidytext", "tensorflow", "keras", "caret", "data.table",
pkgs "googleLanguageR", "cld2","tidyverse", "datasets", "ggplot2", "tictoc")
for (pkg in pkgs){
if(!require(pkg, character.only = TRUE)) install.packages(pkg)
library(pkg, character.only = TRUE)
}
22 Supervised learning - sentence recognition
mt2 = readRDS("db/mt2.rds")
mt3 = mt2 %>%
  mutate(code = code2) %>%
  select(id, code, word = word2)
code_inc = mt3 %>%
  select(id, code) %>% unique() %>%
  group_by(code) %>%
  count() %>%
  arrange(desc(n)) %>%
  filter(n > 1500) %>%   # keep only codes that appear in more than 1,500 documents
  pull(code)
length(code_inc)
wordnumber = 2000
wordData = mt3 %>%
  filter(code %in% code_inc) %>%
  group_by(word) %>%
  count() %>%
  arrange(desc(n)) %>%
  ungroup() %>%
  slice(1:wordnumber) %>%            # keep the wordnumber (2,000) most frequent words
  mutate(wordid = row_number()) %>%  # assign each word a numeric id
  select(-n)
wordData
mt4 = mt3 %>% filter(code %in% code_inc) %>%
  left_join(wordData, by = c("word")) %>%
  na.omit() %>%
  ungroup() %>%
  mutate(code = as.numeric(code))
## Use code_lkup to renumber code as code_nums (1, 2, ...) and train on code_nums
code_lkup = mt4 %>%
  select(code) %>% unique() %>%
  mutate(code_nums = row_number())
mt5 = mt4 %>%
  filter(id != "W615000") %>%
  left_join(code_lkup, by = c("code"))
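# A minimal sketch of what the lookup does (hypothetical code values, not from mt4):
# the surviving codes are simply re-labelled as consecutive integers starting at 1.
tibble(code = c(1101, 2304, 4502)) %>%
  mutate(code_nums = row_number())   # code_nums becomes 1, 2, 3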
# Build the training/test data
library(parallel)
gg = mt5 %>% select(id, code_nums) %>% unique()
nrow(gg)
gg$code_nums
set.seed(2022)
trainIndex = createDataPartition(gg$code_nums, p = 0.7,   # stratified 70/30 split by class
                                 list = FALSE,
                                 times = 1)
index = trainIndex %>% as.numeric()
training_pre = gg[ index, ]
testing_pre  = gg[-index, ]
train_index = training_pre %>% pull(id) %>% unique()
test_index  = testing_pre %>% pull(id) %>% unique()
training = mt5 %>% filter(id %in% train_index)
testing  = mt5 %>% filter(id %in% test_index)
training$code %>% unique()
training %>% group_by(code_nums) %>% count() %>%
  left_join(testing %>% group_by(code_nums) %>% count(),
            by = c("code_nums")) %>%
  setNames(c("code_nums", "train", "test")) %>%
  mutate(prob = test/train)   # per-class test/train ratio to check the split balance
trainDX = training %>% select(id, wordid)
trainDY = training %>% select(id, code_nums) %>% unique()
#trainDY %>%
#  group_by(id) %>%
#  count() %>%
#  arrange(desc(n))
testDX = testing %>% select(id, wordid)
testDY = testing %>% select(id, code_nums) %>% unique()
trainY = trainDY %>% pull(code_nums)
testY  = testDY %>% pull(code_nums)
trainY %>% unique() %>% length(.)
# Convert to a list (one vector of word ids per document)
library(parallel)
trainMX = list()
trainF  = function(i){trainDX %>% filter(id == i) %>% pull(wordid)}
trainMX = mclapply(train_index, trainF, mc.cores = 40)
length(trainMX) == nrow(trainDY)
#saveRDS(trainMX, "db/trainMX.rds")
#trainMX = readRDS("db/trainMX.rds")
trainMX
testMX = list()
testF  = function(i){testDX %>% filter(id == i) %>% pull(wordid)}
testMX = mclapply(test_index, testF, mc.cores = 40)
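# Quick look at the result (illustrative, not in the original): each list element holds
# the word ids of one document, so the list should have one element per document id.
str(trainMX[[1]])                # e.g. int [1:k] ... word ids of the first training id
length(testMX) == nrow(testDY)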
sequencingF = function(mx){
  jj = matrix(0, nrow = length(mx), ncol = 2000)   # one row per document, one column per wordid
  for (i in 1:length(mx))
    jj[i, mx[[i]]] <- 1                            # mark the words present in document i
  return(jj)
}
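# A minimal sanity check of sequencingF (toy word ids, not from the data): every document
# becomes a 2000-column indicator row with 1s at the positions of its word ids.
demo_mx = list(c(1, 5, 9), c(2, 5))
demo_jj = sequencingF(demo_mx)
dim(demo_jj)               # 2 2000
which(demo_jj[1, ] == 1)   # 1 5 9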
# one-hot encode to categories
library(reticulate)
library(tensorflow)
use_virtualenv("/home/sehnr/tensorflow/tensorvenv", required = TRUE)
Xtrain = sequencingF(trainMX)
Xtest  = sequencingF(testMX)
Ytrain = to_categorical(trainY)
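# Note on shapes (a check, not in the original): keras::to_categorical() counts classes
# from 0, so integer labels that start at 1 yield max(label) + 1 columns.
to_categorical(c(1, 2, 3))   # a 3 x 4 matrix, with a leading column for class 0
dim(Ytrain)                  # nrow(Xtrain) x (max(trainY) + 1)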
set.seed(2022)
nrow(Xtrain)
val_indices <- sample(1:nrow(Xtrain), nrow(Xtrain)/10)   # hold out ~10% of rows for validation
x_val <- Xtrain[val_indices,]
partial_x_train <- Xtrain[-val_indices,]
y_val <- Ytrain[val_indices,]
partial_y_train = Ytrain[-val_indices,]
library(keras)
keras::k_clear_session()
# ref: https://medium.com/@cmukesh8688/activation-functions-sigmoid-tanh-relu-leaky-relu-softmax-50d3778dcea5
model <- keras_model_sequential() %>%
  layer_dense(units = 18, activation = "relu", input_shape = c(wordnumber)) %>%
  layer_dropout(rate = 0.4) %>%   # dropout layers to avoid overfitting
  layer_dense(units = 72, activation = "relu") %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 36, activation = "sigmoid") %>%
  layer_dense(units = 18, activation = "softmax")
model
model %>% compile(
  optimizer = "adam",
  loss = "categorical_crossentropy",
  metrics = c("accuracy"))
history <- model %>% fit(
  partial_x_train,
  partial_y_train,
  epochs = 10,
  batch_size = 50,
  #validation_split = 0.2,
  validation_data = list(x_val, y_val)
)
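# Not part of the original chunk: the fitted history object can be plotted to inspect
# the loss/accuracy curves on the training and validation sets.
plot(history)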