Prepare VAERS data for safety signal detection

Nan Xiao https://nanx.me/ (Seven Bridges)https://www.sevenbridges.com/ , Soner Koc https://github.com/skoc (Seven Bridges)https://www.sevenbridges.com/ , Kaushik Ghose https://kaushikghose.wordpress.com/ (Seven Bridges)https://www.sevenbridges.com/
2020-05-17

Table of Contents


This page describes the steps involved in downloading and preprocessing the raw VAERS data, and in transforming the data into an analyzable format.

Download the data

To download the VAERS data:

At the time of downloading (February 2020), data for all years since 1990 were complete except for 2019. The data for 2019 contain the VAERS reports processed as of 2019-12-14.

The captchas on each dataset download page make it somewhat complicated to automate the data retrieval process, especially when 30 years of data need to be downloaded. Providing a standard, programmatically accessible API for retrieving the data, like the one for the current AERS database, would be helpful.

Load the data


library("readr")
library("dplyr")
library("tidyr")

# Years covered by the downloaded archives. Each year is expected to be one
# zip file under data/ containing three CSV members (report metadata,
# vaccines, symptoms).
year <- as.character(1990:2019)
k <- length(year)
file_zip <- paste0("data/", year, "VAERSData.zip")
file_meta <- paste0(year, "VAERSDATA.csv")
file_vax <- paste0(year, "VAERSVAX.csv")
file_sym <- paste0(year, "VAERSSYMPTOMS.csv")

# Parse only the columns used later in the pipeline, with explicit types.
# Leaving the other columns to readr's type guessing allows the guessed type
# to differ from year to year, which can make bind_rows() fail, and a sparsely
# filled column can be guessed wrongly; those columns are discarded anyway.
cols_meta <- cols_only(
  VAERS_ID = col_character(),
  AGE_YRS = col_double(),
  SEX = col_character()
)
cols_vax <- cols_only(
  VAERS_ID = col_character(),
  VAX_NAME = col_character()
)
cols_sym <- cols_only(
  VAERS_ID = col_character(),
  SYMPTOM1 = col_character(),
  SYMPTOM2 = col_character(),
  SYMPTOM3 = col_character(),
  SYMPTOM4 = col_character(),
  SYMPTOM5 = col_character()
)

# Preallocate one slot per year for each of the three tables.
lst_meta <- vector("list", k)
lst_vax <- vector("list", k)
lst_sym <- vector("list", k)

# seq_len() keeps the loop from running when no years are configured.
for (i in seq_len(k)) {
  cat("Reading VAERS data from year", year[i], "\n")

  # unz() opens a single member of the zip archive as a connection, so the
  # archive does not have to be extracted to disk first.
  lst_meta[[i]] <- read_csv(
    unz(description = file_zip[i], filename = file_meta[i]),
    col_types = cols_meta
  )
  lst_vax[[i]] <- read_csv(
    unz(description = file_zip[i], filename = file_vax[i]),
    col_types = cols_vax
  )
  lst_sym[[i]] <- read_csv(
    unz(description = file_zip[i], filename = file_sym[i]),
    col_types = cols_sym
  )
}

# Stack the yearly tables and fix the column order used downstream.
df_meta <- dplyr::bind_rows(lst_meta)
df_vax <- dplyr::bind_rows(lst_vax)
df_sym <- dplyr::bind_rows(lst_sym)

df_meta <- df_meta[, c("VAERS_ID", "AGE_YRS", "SEX")]
df_vax <- df_vax[, c("VAERS_ID", "VAX_NAME")]
df_sym <- df_sym[, c("VAERS_ID", "SYMPTOM1", "SYMPTOM2", "SYMPTOM3", "SYMPTOM4", "SYMPTOM5")]

Quality Control

To ensure the basic data quality, remove all rows with NAs from the metadata table:


# Keep only the reports whose ID, age and sex are all present.
df_meta <- tidyr::drop_na(df_meta)

Concatenate columns for symptoms:


# Append a SYMPTOM column joining the five symptom fields of each row with
# ", ". Missing fields show up as the literal text "NA" in the joined string.
df_sym <- dplyr::mutate(
  df_sym,
  SYMPTOM = paste(SYMPTOM1, SYMPTOM2, SYMPTOM3, SYMPTOM4, SYMPTOM5, sep = ", ")
)

Use the VAERS IDs that are common across all tables as the index:


# IDs present in all three tables. intersect() already returns unique values,
# so folding it over the three ID columns gives the shared set directly.
id <- Reduce(
  intersect,
  list(df_meta$VAERS_ID, df_vax$VAERS_ID, df_sym$VAERS_ID)
)

Keep only the VAERS IDs shared across all tables:


# Restrict every table to the reports listed in `id`.
df_meta <- df_meta %>% filter(VAERS_ID %in% id)
df_vax <- df_vax %>% filter(VAERS_ID %in% id)
df_sym <- df_sym %>% filter(VAERS_ID %in% id)

Collapse rows that share the same ID but list multiple symptoms or multiple vaccines into a single row per ID:


# One row per report: sort the vaccine names, then join the names belonging
# to each VAERS_ID into a single ", "-separated string.
df_vax <- df_vax %>%
  arrange(VAX_NAME) %>%
  group_by(VAERS_ID) %>%
  summarise(VAX_NAMES = toString(VAX_NAME))

# Same treatment for the symptom strings built per symptom row.
df_sym <- df_sym %>%
  arrange(SYMPTOM) %>%
  group_by(VAERS_ID) %>%
  summarise(SYMPTOMS = toString(SYMPTOM))

Merge tables

Merge all tables on the VAERS ID:


# Attach the collapsed vaccine and symptom strings to the report metadata,
# keeping only reports found in every table.
df <- df_meta %>%
  inner_join(df_vax, by = "VAERS_ID") %>%
  inner_join(df_sym, by = "VAERS_ID")

Separate the rows with multiple vaccines or multiple symptoms into multiple rows:


# Expand to one row per (report, vaccine, symptom) combination, then discard
# the "NA" placeholders that came from empty symptom fields.
df <- df %>%
  separate_rows(VAX_NAMES, sep = ", ") %>%
  separate_rows(SYMPTOMS, sep = ", ") %>%
  filter(SYMPTOMS != "NA")

Recode variables

Recode stratification variables and recode age groups:


# Switch to generic column names (assigned by position): report id, the two
# stratification variables, then the vaccine (var1) and symptom (var2).
df <- setNames(df, c("id", "strat_age", "strat_gender", "var1", "var2"))

# Spell out the single-letter sex codes and store the result as a factor.
gender_labels <- c(F = "Female", M = "Male", U = "Unknown")
df$strat_gender <- as.factor(recode(df$strat_gender, !!!gender_labels))

# Bin age in years into four right-closed intervals:
# (-Inf, 2], (2, 18], (18, 65], (65, Inf).
age_labels <- c("<= 2", "2 - 18", "18 - 65", "> 65")
age_group <- cut(
  df$strat_age,
  breaks = c(-Inf, 2, 18, 65, Inf),
  labels = age_labels,
  right = TRUE
)

# Going through character makes as.factor() derive its levels from the sorted
# labels that actually occur, rather than from the order given to cut().
df$strat_age <- as.factor(as.character(age_group))

Save to file

Rearrange variable order:


# Put the id and the vaccine/symptom pair first, strata last.
df <- dplyr::select(df, id, var1, var2, strat_age, strat_gender)

Sanity check:


# Number of rows and columns of the final long-format table.
c(nrow(df), ncol(df))

# [1] 3441506       5

# Preview of the first six rows.
head(df, n = 6L)

# # A tibble: 6 x 5
#   id     var1                var2        strat_age strat_gender
#   <chr>  <chr>               <chr>       <fct>     <fct>       
# 1 025001 DTP (NO BRAND NAME) Agitation   <= 2      Female      
# 2 025003 DTP (TRI-IMMUNOL)   Delirium    <= 2      Male        
# 3 025003 DTP (TRI-IMMUNOL)   Hypokinesia <= 2      Male        
# 4 025003 DTP (TRI-IMMUNOL)   Hypotonia   <= 2      Male        
# 5 025003 POLIO VIRUS         Delirium    <= 2      Male        
# 6 025003 POLIO VIRUS         Hypokinesia <= 2      Male      

Save to file:


# Write the processed table to disk. saveRDS() does not create missing
# directories, so make sure the output folder exists first; dir.create() is
# silent if the folder is already there.
out_file <- "data-processed/df.rds"
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
saveRDS(df, file = out_file)

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.