Text classification using sparse matrices, bag of words, TF-IDF, and penalized logistic regression

Danny Morris

2019/02/17

This post demonstrates a strategy for binary text classification using efficient data representation, transformation, and modeling techniques. Using the text2vec package, raw text is tokenized, converted to a sparse bag-of-words feature matrix, and weighted with TF-IDF. The weighted matrix then serves as input to the efficient penalized logistic regression implemented in the glmnet package. The whole pipeline runs inside repeated cross-validation for robust model evaluation.
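Before the full pipeline, here is a minimal sketch of the core representation using two toy documents (the strings below are invented for illustration, not drawn from the real data set):

library(text2vec)

toy_docs <- c("the cat sat on the mat", "the dog chased the cat")

# iterate over documents, tokenizing on word boundaries
it <- itoken(toy_docs, tokenizer = word_tokenizer, progressbar = FALSE)

# sparse bag-of-words counts: one row per document, one column per term
vectorizer <- vocab_vectorizer(create_vocabulary(it))
dtm <- create_dtm(it, vectorizer)

# reweight the counts by term frequency-inverse document frequency
tfidf <- TfIdf$new()
dtm_tfidf <- fit_transform(dtm, tfidf)

dim(dtm_tfidf)  # 2 documents x 7 unique terms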

Packages

library(text2vec)     # NLP tools
library(tidytext)     # tidy text mining
library(glmnet)       # logistic regression
library(tidymodels)   # modeling
library(tidyverse)    # general purpose data manipulation
library(textstem)     # word lemmatization

Data

# read the labeled documents, sample 10% for speed, and assign a document ID
text_df <- read_csv("text_clf.csv") %>%
  sample_frac(0.1) %>%
  mutate(doc_id = row_number())
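The file text_clf.csv is assumed to contain a Text column holding the raw documents and a binary Label column. Readers without the file can substitute a small stand-in with the same shape (the rows below are invented for illustration):

# hypothetical stand-in matching the assumed schema
text_df <- tibble(
  Text  = c("great product, works as described",
            "stopped working after one day",
            "fast shipping and easy to use",
            "poor quality, would not recommend"),
  Label = c(1, 0, 1, 0)
) %>%
  mutate(doc_id = row_number())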

Model function

# Arguments: training and testing data
# Returns: AUC of out-of-sample (testing) predictions
model_function <- function(train, test) {
  
  model_dfs <- list(train = train, test = test)
  
  ############
  # Tokenize #
  ############
  
  tokens <- model_dfs %>%
    map(function(x) {
      x %>%
        tidytext::unnest_tokens(word, Text) %>%
        ungroup() %>%
        # remove stopwords
        anti_join(stop_words, by = "word") %>%
        # lemmatize
        mutate(word = textstem::lemmatize_words(word)) %>%
        # strip non-alphabetic characters and squeeze leftover whitespace
        mutate(word = str_squish(str_replace_all(word, "[^[:alpha:]]", " "))) %>%
        # obtain the vector of words (tokens) for each training and testing document
        split(.$doc_id) %>%
        map(~ pull(.x, word))
    })
  
  ########################################
  # Create vocabulary from training data #
  ########################################
  
  # note: split() above orders the token lists by doc_id, which matches the
  # row order of the training and testing data frames (assuming every
  # document retains at least one token after filtering)
  iter_train <- text2vec::itoken(iterable = tokens$train, 
                                 ids = model_dfs$train$doc_id,
                                 progressbar = FALSE)
  
  iter_test <- text2vec::itoken(iterable = tokens$test, 
                                ids = model_dfs$test$doc_id,
                                progressbar = FALSE)
  
  # the vocabulary is learned from the training data only,
  # so no information leaks from the test set
  vocab <- text2vec::create_vocabulary(iter_train)
  
  vectorizer <- text2vec::vocab_vectorizer(vocab)
  
  ##########################
  # Document-term matrices #
  ##########################
  
  doc_term_train <- text2vec::create_dtm(iter_train, vectorizer)
  doc_term_test  <- text2vec::create_dtm(iter_test, vectorizer)
  
  ##########
  # TF-IDF #
  ##########
  
  tf_idf <- TfIdf$new()
  
  # fit TF-IDF weights on the training data and transform it in one step
  doc_term_train_tfidf <- fit_transform(doc_term_train, tf_idf)
  
  # apply the pre-fitted TF-IDF transformation to the testing data
  doc_term_test_tfidf <- transform(doc_term_test, tf_idf)
  
  ####################
  # Train classifier #
  ####################
  
  # 5-fold cross-validated lasso (alpha = 1) logistic regression,
  # tuned on AUC and fit directly on the sparse TF-IDF matrix
  glmnet_classifier <- cv.glmnet(
    x = doc_term_train_tfidf, 
    y = model_dfs$train$Label,
    family = 'binomial', 
    alpha = 1,
    type.measure = "auc",
    nfolds = 5,
    thresh = 1e-3,
    maxit = 1e3)
  
  # predicted class probabilities for the held-out test documents
  probs <- predict(glmnet_classifier, doc_term_test_tfidf, type = 'response')[, 1]
  
  ###########################
  # Measure AUC on test set #
  ###########################
  
  # glmnet:::auc() is an unexported helper that computes AUC
  # from a binary response and predicted probabilities
  test_auc <- glmnet:::auc(
    y = model_dfs$test$Label %>% as.integer(),
    prob = probs
  )
  
  return(test_auc)
  
}
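As a quick sanity check before running repeated cross-validation, the function can be applied to a single train/test split (initial_split() comes from rsample, attached via tidymodels):

single_split <- rsample::initial_split(text_df, prop = 0.8, strata = "Label")

model_function(train = rsample::training(single_split),
               test  = rsample::testing(single_split))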

Implement cross-validation

Using purrr::map_dbl(), run the model function within each cross-validation fold and collect the test AUC for each.

n_folds <- 5
n_repeats <- 3

# stratify on Label so each fold preserves the class balance
cv_splits <- rsample::vfold_cv(text_df, 
                               v = n_folds, 
                               repeats = n_repeats, 
                               strata = "Label")

# run the full pipeline once per fold; each call returns a test-set AUC
cv_models <- map_dbl(cv_splits$splits, function(cv) {
  train <- rsample::training(cv)
  test <- rsample::testing(cv)
  model_function(train = train, test = test)
})

Evaluate CV results

Evaluate the classifier's performance by examining the distribution of AUC across all cross-validation folds.

cv_models %>%
  enframe() %>%
  ggplot(aes(x = value)) +
  geom_density() +
  labs(title = "Distribution of AUC across all cross-validation folds",
       subtitle = paste("N models =", length(cv_splits$splits)),
       x = "AUC") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme_bw()
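A compact numeric summary makes a useful companion to the density plot:

# mean and spread of out-of-fold AUC across all n_folds * n_repeats models
tibble(
  mean_auc = mean(cv_models),
  sd_auc   = sd(cv_models),
  min_auc  = min(cv_models),
  max_auc  = max(cv_models)
)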