|
| 1 | +#' Unnest Nested Text Columns |
| 2 | +#' |
| 3 | +#' Unnest nested text columns in a data.frame. Attempts to locate the nested |
| 4 | +#' text column without specifying. |
| 5 | +#' |
| 6 | +#' @param dataframe A dataframe object. |
| 7 | +#' @param column Column name to search for markers/terms. |
| 8 | +#' @param integer.rownames logical. If \code{TRUE} then the rownames are numbered |
| 9 | +#' 1 through number of rows, otherwise the original row number is retained |
| 10 | +#' follwed by a period and the element number from the list. |
| 11 | +#' @param \ldots ignored. |
| 12 | +#' @return Returns an unnested data.frame. |
| 13 | +#' @export |
| 14 | +#' @examples |
| 15 | +#' dat <- DATA |
| 16 | +#' |
| 17 | +#' ## Add a nested/list text column |
| 18 | +#' dat$split <- lapply(dat$state, function(x) { |
| 19 | +#' unlist(strsplit(x, '(?<=[?!.])\\s+', perl = TRUE)) |
| 20 | +#' }) |
| 21 | +#' |
| 22 | +#' unnest_text(dat) |
| 23 | +#' unnest_text(dat, integer.rownames = FALSE) |
| 24 | +#' |
| 25 | +#' ## Add a second nested integer column |
| 26 | +#' dat$d <- lapply(dat$split, nchar) |
| 27 | +#' \dontrun{ |
| 28 | +#' unnest_text(dat) # causes error, must supply column explicitly |
| 29 | +#' } |
| 30 | +#' unnest_text(dat, 'split') |
| 31 | +#' |
| 32 | +#' ## As a data.table |
| 33 | +#' library(data.table) |
| 34 | +#' dt_dat <- data.table::as.data.table(data.table::copy(dat)) |
| 35 | +#' unnest_text(dt_dat, 'split') |
| 36 | +#' \dontrun{ |
| 37 | +#' unnest_text(dt_dat, 'd') |
| 38 | +#' } |
| 39 | +#' |
| 40 | +#' \dontrun{ |
| 41 | +#' ## As a tibble |
| 42 | +#' library(tibble) |
| 43 | +#' t_dat <- tibble:::as_tibble(dat) |
| 44 | +#' unnest_text(t_dat, 'split') |
| 45 | +#' } |
| 46 | +unnest_text <- function(dataframe, column, integer.rownames = TRUE, ...){ |
| 47 | + |
| 48 | + if (missing(column)) { |
| 49 | + column <- names(dataframe)[!unlist(lapply(as.data.frame(dataframe), is.atomic))] |
| 50 | + if (length(column) == 0) stop("There appears to be no nested columns. Please supply `column` explicitly.") |
| 51 | + if (length(column) > 1) stop("There appears to be multiple nested columns. Please supply `column` explicitly.") |
| 52 | + message(sprintf('Nested column detected, unnesting: %s', column)) |
| 53 | + } |
| 54 | + |
| 55 | + nms <- colnames(dataframe) |
| 56 | + |
| 57 | + lens <- lengths(dataframe[[column]]) |
| 58 | + col <- unlist(dataframe[[column]]) |
| 59 | + |
| 60 | + if (!is.character(col)) { |
| 61 | + warning(sprintf(paste0('Unnesting: `%s`\nThis is not a character column.\n\n', |
| 62 | + 'Perhaps you want to use `tidyr::unnest` instead?'), column), call. = FALSE) |
| 63 | + } |
| 64 | + |
| 65 | + dataframe[[column]] <- NA |
| 66 | + |
| 67 | + dataframe <- dataframe[rep(seq_len(nrow(dataframe)), lens),] |
| 68 | + |
| 69 | + dataframe[[column]] <- col |
| 70 | + if (isTRUE(integer.rownames)) { |
| 71 | + rownames(dataframe) <- NULL |
| 72 | + } else { |
| 73 | + rnms <- rownames(dataframe) |
| 74 | + rnms <- ifelse(grepl('\\.', rnms), rnms, paste0(rnms, '.0')) |
| 75 | + |
| 76 | + rownames(dataframe) <- paste0( |
| 77 | + gsub('\\.+$', '', rnms), |
| 78 | + '.', |
| 79 | + as.integer(gsub('^\\d+\\.', '', rnms)) + 1 |
| 80 | + ) |
| 81 | + } |
| 82 | + |
| 83 | + dataframe |
| 84 | + |
| 85 | +} |
| 86 | + |
0 commit comments