Skip to content

Commit 80dca9f

Browse files
author
Tyler Rinker
committed
* unnest_text added to located and unnest nested text columns in a data.frame.
1 parent 70d7d31 commit 80dca9f

15 files changed

Lines changed: 356 additions & 135 deletions

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
Package: textshape
2-
Date: 2018-02-11
32
Title: Tools for Reshaping Text
4-
Version: 1.5.2
3+
Version: 1.5.3
54
Authors@R: c(person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")), person("Joran",
65
"Elias", role = "ctb"), person("Matthew", "Flickinger", role = "ctb"), person('Paul', 'Foster', role =
76
"ctb"))
@@ -49,3 +48,4 @@ Collate:
4948
'tidy_table.R'
5049
'tidy_vector.R'
5150
'unique_pairs.R'
51+
'unnest_text.R'

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,6 @@ export(tidy_table)
6565
export(tidy_tdm)
6666
export(tidy_vector)
6767
export(unique_pairs)
68+
export(unnest_text)
6869
importFrom(data.table,":=")
6970
importFrom(data.table,.N)

NEWS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ NEW FEATURES
3131
using the concatenated list/atomic vector names as the names of the single
3232
tiered list.
3333

34+
* `unnest_text` added to located and unnest nested text columns in a data.frame.
3435

3536
MINOR FEATURES
3637

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ textshape 1.5.1 -
3131
using the concatenated list/atomic vector names as the names of the single
3232
tiered list.
3333

34+
* `unnest_text` added to located and unnest nested text columns in a data.frame.
3435

3536
**MINOR FEATURES**
3637

R/unnest_text.R

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#' Unnest Nested Text Columns
2+
#'
3+
#' Unnest nested text columns in a data.frame. Attempts to locate the nested
4+
#' text column without specifying.
5+
#'
6+
#' @param dataframe A dataframe object.
7+
#' @param column Column name to search for markers/terms.
8+
#' @param integer.rownames logical. If \code{TRUE} then the rownames are numbered
9+
#' 1 through number of rows, otherwise the original row number is retained
10+
#' follwed by a period and the element number from the list.
11+
#' @param \ldots ignored.
12+
#' @return Returns an unnested data.frame.
13+
#' @export
14+
#' @examples
15+
#' dat <- DATA
16+
#'
17+
#' ## Add a nested/list text column
18+
#' dat$split <- lapply(dat$state, function(x) {
19+
#' unlist(strsplit(x, '(?<=[?!.])\\s+', perl = TRUE))
20+
#' })
21+
#'
22+
#' unnest_text(dat)
23+
#' unnest_text(dat, integer.rownames = FALSE)
24+
#'
25+
#' ## Add a second nested integer column
26+
#' dat$d <- lapply(dat$split, nchar)
27+
#' \dontrun{
28+
#' unnest_text(dat) # causes error, must supply column explicitly
29+
#' }
30+
#' unnest_text(dat, 'split')
31+
#'
32+
#' ## As a data.table
33+
#' library(data.table)
34+
#' dt_dat <- data.table::as.data.table(data.table::copy(dat))
35+
#' unnest_text(dt_dat, 'split')
36+
#' \dontrun{
37+
#' unnest_text(dt_dat, 'd')
38+
#' }
39+
#'
40+
#' \dontrun{
41+
#' ## As a tibble
42+
#' library(tibble)
43+
#' t_dat <- tibble:::as_tibble(dat)
44+
#' unnest_text(t_dat, 'split')
45+
#' }
46+
unnest_text <- function(dataframe, column, integer.rownames = TRUE, ...){
47+
48+
if (missing(column)) {
49+
column <- names(dataframe)[!unlist(lapply(as.data.frame(dataframe), is.atomic))]
50+
if (length(column) == 0) stop("There appears to be no nested columns. Please supply `column` explicitly.")
51+
if (length(column) > 1) stop("There appears to be multiple nested columns. Please supply `column` explicitly.")
52+
message(sprintf('Nested column detected, unnesting: %s', column))
53+
}
54+
55+
nms <- colnames(dataframe)
56+
57+
lens <- lengths(dataframe[[column]])
58+
col <- unlist(dataframe[[column]])
59+
60+
if (!is.character(col)) {
61+
warning(sprintf(paste0('Unnesting: `%s`\nThis is not a character column.\n\n',
62+
'Perhaps you want to use `tidyr::unnest` instead?'), column), call. = FALSE)
63+
}
64+
65+
dataframe[[column]] <- NA
66+
67+
dataframe <- dataframe[rep(seq_len(nrow(dataframe)), lens),]
68+
69+
dataframe[[column]] <- col
70+
if (isTRUE(integer.rownames)) {
71+
rownames(dataframe) <- NULL
72+
} else {
73+
rnms <- rownames(dataframe)
74+
rnms <- ifelse(grepl('\\.', rnms), rnms, paste0(rnms, '.0'))
75+
76+
rownames(dataframe) <- paste0(
77+
gsub('\\.+$', '', rnms),
78+
'.',
79+
as.integer(gsub('^\\d+\\.', '', rnms)) + 1
80+
)
81+
}
82+
83+
dataframe
84+
85+
}
86+

README.Rmd

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ Most of the functions split, expand, or tidy a `vector`, `list`, `data.frame`, o
5959
| `from_to` | `vector`, `data.frame` | Prepare speaker data for a flow network |
6060
| `mtabulate` | `vector`, `list`, `data.frame` | Dataframe/list version of `tabulate` to produce count matrix |
6161
| `flatten` | `list` | Flatten nested, named list to single tier |
62+
| `unnest_text` | `data.frame` | Unnest a nested text column |
6263
| `split_index` | `vector`, `list`, `data.frame` | Split at specified indices |
6364
| `split_match` | `vector` | Split vector at specified character/regex match |
6465
| `split_portion` | `vector`\* | Split data into portioned chunks |
@@ -237,10 +238,15 @@ The `tidy_colo_dtm` and `tidy_colo_tdm` functions convert a `DocumentTermMatrix`
237238

238239
```{r}
239240
my_dtm <- with(presidential_debates_2012, q_dtm(dialogue, paste(time, tot, sep = "_")))
241+
sw <- unique(c(
242+
lexicon::sw_jockers,
243+
lexicon::sw_loughran_mcdonald_long,
244+
lexicon::sw_fry_1000
245+
))
240246
241247
tidy_colo_dtm(my_dtm) %>%
242248
tbl_df() %>%
243-
filter(!term_1 %in% c('i', lexicon::sw_onix) & !term_2 %in% lexicon::sw_onix) %>%
249+
filter(!term_1 %in% c('i', sw) & !term_2 %in% sw) %>%
244250
filter(term_1 != term_2) %>%
245251
unique_pairs() %>%
246252
filter(n > 15) %>%

0 commit comments

Comments
 (0)