Chapter 7 IPTW

Summary of TWANG packages’s tutorial of https://cran.r-project.org/web/packages/twang/vignettes/iptw.pdf Always thanks to researcher for making such nice tutorial for me!!!. Now, I use Korean for me haha

7.1 package tutorial

twang, survey 라이브러리를 불러옴.

rm(list=ls())
pkgs = c("twang", "survey","tidyverse")
for (pkg in pkgs) {
  if (pkg %in% rownames(installed.packages()) == FALSE)
  {eval( bquote(install.packages(.(pkg))) )}
  else 
  {eval(bquote(library(.(pkg))))}  
}

실습데이터 iptwExWide를 불러옴.

data("iptwExWide")

Time varying value(TV)와 Treatment가 있고 서로 상관이 있는 상태에서 outcome을 예측해 봅니다. 이때 시간 \(t_i\) 가 \(t_{i+1}\)에 영향을 미친다고 가정하면, use0로 인해 tx1이 바뀔수 있는 상태입니다. 즉 \(Outcome_i \sim \beta_1 T_i + \beta_2 U_i + \epsilon\) 으로 예측하면 \(T_i \sim U_i\) 라는, 즉 쓸만해서 약을 썼다는 것을 조절하지 못하게 됩니다. 따라서 쓸만해서 약을 쓰는 사람 중 비슷한 사람을 매칭하는 방법을 사용할 수 있습니다. 이때 propensity score matching방법을 이용하여 계산된 weighting 값을 이용해서 컨트롤 해주겠다는 의미 정도입니다.

첫 baseline 을 t = 0로 하고 3번 추적되었다고 하면 아래와 같이 식을 만들수 있습니다.

iptw.Ex1 <- iptw(list(tx1 ~ use0 + gender + age,
 tx2 ~ use1 + use0 + tx1 + gender + age,
 tx3 ~ use2 + use1 + use0 + tx2 + tx1 + gender + age),
 timeInvariant ~ gender + age,
 data = iptwExWide,
 cumulative = FALSE,
 priorTreatment = FALSE,
 verbose = FALSE,
 stop.method = "es.max",
 n.trees = 5000)

이제 weighting 을 구하고 이용하여 Treatment effect를 산출해 보겠습니다. 이게 무슨 뜻일까?(20120707) " Because the weights often have substantial variation, the weights are commonly stabilized where the standard inverse probability of treatment weights are multiplied by the estimated probability of receiving the treatment that each individual received, conditioning only on previous periods’ treatment indicators."

우선 표준화된 가중치를 구하고, 이후 이전 treatment가 지금의 treatment에 주는 영향을 고려한 값을 곱해서 stablized weights 를 산출한다는 뜻인듯.

우선 number of periods of treatment 가 outcome에 영향을 준다고 가정하고 통계를 돌려보자. 그렇다면, nTx가 outcome에 영향을 미치는데, 이때 iptw을 이용한 가중치를 보정해 주는 것. 이때 conditional regression 이 필요한데, 가중치와 id(cluster)등은 survey package의 svydesign, svyglm을 이용하여 컨트롤함. 먼저 standardized weight를 구함. 이때 get.weights.unstab 함수로 iptw.Ex1에서 3 시간 포인트 별로 구했던 값을 이용하여 가중치를 산출(원리에 대해서는 논문 읽기).

unstabWt1 <- get.weights.unstab(iptw.Ex1)
nTx <- with(iptwExWide, tx1 + tx2 + tx3)
outDatUnstab <- data.frame(outcome = iptwExWide$outcome,
 nTx,
 wt = unstabWt1$es.max.ATE)
sv1unstab <- svydesign(~1, weights = ~wt, data = outDatUnstab)

fitUnstab <- svyglm(outcome ~ nTx, sv1unstab)
coef(fitUnstab)

## (Intercept)         nTx 
##  0.08325034 -0.12798336

이때 nTx 에 따라 -0.12798 의 \(\beta\) 값이 나타남. 이제 여기의 weight 값에 이전 treatment가 지금의 treatment에 주는 영향을 고려한 값을 곱해서 stablized weights 를 산출함.

fitList <- list(glm(tx1 ~ 1, family = binomial, data = iptwExWide),
 glm(tx2 ~ tx1, family = binomial, data = iptwExWide),
 glm(tx3 ~ tx1 + tx2, family = binomial, data = iptwExWide))

numWt <- get.weights.num(iptw.Ex1, fitList)
stabWt1 <- unstabWt1 * numWt

outDatStab <- data.frame(outcome = iptwExWide$outcome,
 nTx,
 wt = stabWt1$es.max.ATE)

이제 survey 방식으로 회귀분석을 하고 해석하면 됨.

sv1stab <- svydesign(~1, weights = ~wt, data = outDatStab)
fitStab <- svyglm(outcome ~ nTx, sv1stab)
cbind(coef(fitStab), confint(fitStab)) [2,]

##                   2.5 %      97.5 % 
## -0.13105300 -0.18662568 -0.07548032

7.2 case study of noise exposure and hypertension

소음에 노출된 사람에서 고혈압이 발생하는가? >load data

dat = readxl::read_excel('data/finaldat_210707.xlsx')
head(dat)

## # A tibble: 6 x 44
##       vo person_id location_id tstart               year exp_noise exp_min
##    <dbl>     <dbl>       <dbl> <dttm>              <dbl> <chr>       <dbl>
## 1 227516        18          14 2014-05-15 00:00:00  2014 yes             1
## 2 255450        18        1554 2015-05-13 00:00:00  2015 yes             1
## 3 227492        22          14 2014-05-15 00:00:00  2014 no              0
## 4 255510        22        1554 2015-05-13 00:00:00  2015 no              0
## 5 280838        22        1554 2016-05-11 00:00:00  2016 no              0
## 6 227463        26        1502 2014-05-15 00:00:00  2014 no              0
## # … with 37 more variables: exp_crdvsc <chr>, exp_night <dbl>, exp_nitro <dbl>,
## #   exp_dcm <dbl>, exp_dcfm <dbl>, exp_dcf <dbl>, exp_anitril <dbl>,
## #   exp_cs2 <dbl>, exp_tce <dbl>, exp_anti <dbl>, exp_cnna <dbl>,
## #   exp_kcn <dbl>, exp_hcn <dbl>, exp_no2 <dbl>, exp_co <dbl>, exp_vib <dbl>,
## #   exp_hipr <dbl>, exp_pb <dbl>, exp_cd <dbl>, age <dbl>, htn <chr>,
## #   sex <chr>, smokh <chr>, agegp <chr>, BMI <chr>, exercise <chr>, dm <chr>,
## #   pr_rgr04 <chr>, bd_rgr3 <dbl>, sum <dbl>, waist_obs <chr>, tstop <dttm>,
## #   htn_oc <chr>, tstart2 <dbl>, tstop2 <dbl>, flag <dbl>, htnoc1st <dbl>

long form이므로 wide 형태로 변형.

2014, 2015, 2016

dat2 = dat # %>% slice(1:17)

dat2 = dat2 %>%
  select(person_id, year,flag, exp_noise, exp_crdvsc, tstart2, tstop2, age, sex, 
         smokh, BMI, exercise, dm, pr_rgr04, sum, htn_oc)

dat3 = dat2 %>%
  mutate(exp_noise=as.numeric(exp_noise == 'yes')) %>%
  #select(person_id, tstart, year, exp_noise, htnoc1st) %>%
  mutate(py = tstop2 - tstart2) %>% select(-c(tstart2, tstop2)) %>%
  mutate_if(is.numeric, as.character, is.factor, as.character) %>%
  pivot_longer(-c(person_id, year, flag), 
               names_to = "variables", 
               values_to ="values", 
               values_transform = list(val = as.character))  %>%
  mutate(variables2= paste0(variables, "_", flag)) %>%
  select(-variables, -flag, -year) %>%
  pivot_wider(names_from = variables2, 
              values_from= values) %>%
  filter(!is.na(htn_oc_3)) %>%
  data.frame()

iptw.Ex =  iptw(list(
  exp_noise_1 ~ BMI_1 + sex_1 + age_1, 
  exp_noise_2 ~ BMI_2 + exp_noise_1 + sex_1 + age_1, 
  exp_noise_3 ~ BMI_3 + BMI_2 + BMI_1 + exp_noise_2 + sex_1 + age_1), 
  timeInvariant ~ sex_1 + age_1, 
  data=dat3, 
  cumulative = FALSE, 
  priorTreatment = FALSE, 
  verbose = FALSE, 
  stop.method = "es.max", 
  n.trees = 1000
  )