Import Word document

The function docx_summary() returns the content of a Word document.

library(officer)
# Locate the example Word document shipped with the officer package,
# read it, and summarise its content.
example_docx <- system.file(
  "doc_examples", "example.docx",
  package = "officer"
)
doc <- read_docx(example_docx)
content <- docx_summary(doc)
content
## # A tibble: 69 x 11
##    doc_index content_type     style_name
##        <int>        <chr>          <chr>
##  1         1    paragraph      heading 1
##  2         2    paragraph           <NA>
##  3         3    paragraph      heading 1
##  4         4    paragraph List Paragraph
##  5         5    paragraph List Paragraph
##  6         6    paragraph List Paragraph
##  7         7    paragraph      heading 2
##  8         8    paragraph List Paragraph
##  9         9    paragraph List Paragraph
## 10        10    paragraph List Paragraph
## # ... with 59 more rows, and 8 more variables: text <chr>, level <dbl>,
## #   num_id <int>, row_id <int>, is_header <lgl>, cell_id <dbl>,
## #   col_span <dbl>, row_span <dbl>

Explore the results:

library(dplyr)
# Count the distinct doc_index values found for each content type.
content %>%
  group_by(content_type) %>%
  summarise(n = n_distinct(doc_index))
## # A tibble: 2 x 2
##   content_type     n
##          <chr> <int>
## 1    paragraph    17
## 2   table cell     1

To get all paragraphs:

# Keep only the paragraph rows and the columns that describe them.
par_data <- content %>%
  filter(content_type %in% "paragraph") %>%
  select(doc_index, style_name, text, level, num_id) %>%
  # Shorten text to at most 30 characters so it can be displayed in this
  # vignette. substr() already stops at the end of shorter strings, so no
  # explicit length check is needed.
  mutate(text = substr(text, start = 1, stop = 30))

par_data
## # A tibble: 17 x 5
##    doc_index     style_name                           text level num_id
##        <int>          <chr>                          <chr> <dbl>  <int>
##  1         1      heading 1                        Title 1    NA     NA
##  2         2           <NA> Lorem ipsum dolor sit amet, co    NA     NA
##  3         3      heading 1                        Title 2    NA     NA
##  4         4 List Paragraph             Quisque tristique      1      2
##  5         5 List Paragraph      Augue nisi, et convallis      1      2
##  6         6 List Paragraph            Sapien mollis nec.      1      2
##  7         7      heading 2                    Sub title 1    NA     NA
##  8         8 List Paragraph             Quisque tristique      1      1
##  9         9 List Paragraph      Augue nisi, et convallis      1      1
## 10        10 List Paragraph            Sapien mollis nec.      1      1
## 11        11           <NA>                                   NA     NA
## 12        12           <NA> Phasellus nec nunc vitae nulla    NA     NA
## 13        13      heading 2                    Sub title 2    NA     NA
## 14        14           <NA> Morbi rhoncus sapien sit amet     NA     NA
## 15        15           <NA>                                   NA     NA
## 16        17           <NA>                                   NA     NA
## 17        18           <NA>                                   NA     NA

Word tables

Tables are unstacked, i.e. each table cell is returned as one row:

# Keep only the rows whose content type is "table cell".
table_cells <- filter(content, content_type %in% "table cell")
print(table_cells)
## # A tibble: 52 x 11
##    doc_index content_type    style_name        text level num_id row_id
##        <int>        <chr>         <chr>       <chr> <dbl>  <int>  <int>
##  1        16   table cell Light Shading      Petals    NA     NA      1
##  2        16   table cell Light Shading 5,621498349    NA     NA      2
##  3        16   table cell Light Shading 4,994616997    NA     NA      3
##  4        16   table cell Light Shading 4,767504884    NA     NA      4
##  5        16   table cell Light Shading  25,9242382    NA     NA      5
##  6        16   table cell Light Shading 6,489375001    NA     NA      6
##  7        16   table cell Light Shading   5,7858682    NA     NA      7
##  8        16   table cell Light Shading 5,645575295    NA     NA      8
##  9        16   table cell Light Shading 4,828953215    NA     NA      9
## 10        16   table cell Light Shading 6,783500773    NA     NA     10
## # ... with 42 more rows, and 4 more variables: is_header <lgl>,
## #   cell_id <dbl>, col_span <dbl>, row_span <dbl>

Cell positions and values are stored in the columns row_id, cell_id, text and is_header (a logical column indicating whether the cell is part of the header). Note that the content (column text) is a character vector.

# Body cells only: drop the header rows, then keep the cell coordinates
# (row_id, cell_id) and the cell text.
table_body <- select(
  filter(table_cells, !is_header),
  row_id, cell_id, text
)
table_body
## # A tibble: 48 x 3
##    row_id cell_id        text
##     <int>   <dbl>       <chr>
##  1      2       1 5,621498349
##  2      3       1 4,994616997
##  3      4       1 4,767504884
##  4      5       1  25,9242382
##  5      6       1 6,489375001
##  6      7       1   5,7858682
##  7      8       1 5,645575295
##  8      9       1 4,828953215
##  9     10       1 6,783500773
## 10     11       1 5,395076839
## # ... with 38 more rows

The data can be reshaped into a wide table using the columns row_id, cell_id and text; this is easy to do with tidyr:

# tidyr is optional: the reshaping only runs when the package can be loaded.
# NOTE(review): spread() is superseded by pivot_wider() in current tidyr;
# it is kept here so that the printed output in this document stays valid.
if (require("tidyr")) {
  # One row per row_id, one column per cell_id, filled with the cell text.
  spread(table_body, cell_id, text)
}
## Le chargement a nécessité le package : tidyr
## # A tibble: 12 x 5
##    row_id         `1`         `2`                   `3`
##  *  <int>       <chr>       <chr>                 <chr>
##  1      2 5,621498349        <NA> 2,46210657918,2034091
##  2      3 4,994616997          AA           2,429320759
##  3      4 4,767504884        <NA>                   AAA
##  4      5  25,9242382        <NA>           2,066051345
##  5      6 6,489375001 25,21130805           2,901582763
##  6      7   5,7858682 25,52433147           2,655642742
##  7      8 5,645575295 Merged cell           2,278691288
##  8      9 4,828953215        <NA>           2,238467716
##  9     10 6,783500773        <NA>           2,202762147
## 10     11 5,395076839        <NA>           2,538375992
## 11     12 4,683617783  29,2459239           2,601945544
## 12     13        Note        <NA>                  <NA>
## # ... with 1 more variables: `4` <chr>

Getting headers requires another operation:

# Same reshaping as for the body, applied to the header cells only.
# Runs only when tidyr can be loaded.
if (require("tidyr")) {
  table_cells %>%
    filter(is_header) %>%
    select(row_id, cell_id, text) %>%
    spread(key = cell_id, value = text)
}
## # A tibble: 1 x 5
##   row_id    `1`       `2`   `3`   `4`
## *  <int>  <chr>     <chr> <chr> <chr>
## 1      1 Petals Internode Sepal Bract

Import PowerPoint document

The function pptx_summary() returns the content of a PowerPoint document.

# Locate the example PowerPoint document shipped with the officer package,
# read it, and summarise its content.
example_pptx <- system.file(
  "doc_examples", "example.pptx",
  package = "officer"
)
doc <- read_pptx(example_pptx)
content <- pptx_summary(doc)
content
## # A tibble: 55 x 9
##                 text    id content_type slide_id row_id cell_id col_span
##                <chr> <chr>        <chr>    <int>  <int>   <int>    <dbl>
##  1             Title    12    paragraph        1     NA      NA       NA
##  2          A table     13    paragraph        1     NA      NA       NA
##  3     and some text    13    paragraph        1     NA      NA       NA
##  4 and some list (1)    13    paragraph        1     NA      NA       NA
##  5 and some list (2)    13    paragraph        1     NA      NA       NA
##  6         Header 1     18   table cell        1      1       1        1
##  7          Header 2    18   table cell        1      1       2        1
##  8          Header 3    18   table cell        1      1       3        1
##  9                 A    18   table cell        1      2       1        1
## 10             12.23    18   table cell        1      2       2        1
## # ... with 45 more rows, and 2 more variables: row_span <dbl>,
## #   media_file <chr>

Explore the results:

# Count the distinct id values found for each content type.
content %>%
  group_by(content_type) %>%
  summarise(n = n_distinct(id))
## # A tibble: 3 x 2
##   content_type     n
##          <chr> <int>
## 1        image     1
## 2    paragraph     5
## 3   table cell     2

To get all paragraphs:

# Paragraph rows only, reduced to the id and text columns.
par_data <- select(
  filter(content, content_type %in% "paragraph"),
  id, text
)

par_data
## # A tibble: 13 x 2
##       id               text
##    <chr>              <chr>
##  1    12              Title
##  2    13           A table 
##  3    13      and some text
##  4    13  and some list (1)
##  5    13  and some list (2)
##  6    15             R logo
##  7     2                 Hi
##  8     3           This is 
##  9     3       an unordered
## 10     3 list of paragraphs
## 11     3                   
## 12     3 This is an ordered
## 13     3 list of paragraphs

To get an image:

# Select the image row(s), then extract the media file they reference
# to "extract.png".
image_row <- filter(content, content_type %in% "image")
# NOTE(review): the output recorded below is FALSE -- presumably the file was
# not written (e.g. the target already existed); confirm before relying on it.
media_extract(doc, path = image_row$media_file, target = "extract.png")
## [1] FALSE

PowerPoint tables

Tables are unstacked, i.e. each table cell is returned as one row:

# Keep only the rows whose content type is "table cell".
table_cells <- filter(content, content_type %in% "table cell")
table_cells
## # A tibble: 41 x 9
##              text    id content_type slide_id row_id cell_id col_span
##             <chr> <chr>        <chr>    <int>  <int>   <int>    <dbl>
##  1      Header 1     18   table cell        1      1       1        1
##  2       Header 2    18   table cell        1      1       2        1
##  3       Header 3    18   table cell        1      1       3        1
##  4              A    18   table cell        1      2       1        1
##  5          12.23    18   table cell        1      2       2        1
##  6      blah blah    18   table cell        1      2       3        1
##  7              B    18   table cell        1      3       1        1
##  8           1.23    18   table cell        1      3       2        1
##  9 blah blah blah    18   table cell        1      3       3        1
## 10              B    18   table cell        1      4       1        1
## # ... with 31 more rows, and 2 more variables: row_span <dbl>,
## #   media_file <chr>

Cell positions and values are stored in the columns row_id, cell_id and text. Note that here there is no indicator for the table header.

# Reshape the cells of one table (id "18") into a wide layout: one row per
# row_id, one column per cell_id. Runs only when tidyr can be loaded.
if (require("tidyr")) {
  table_cells %>%
    # id is a character column, so match it against a string rather than
    # relying on implicit coercion of the number 18.
    filter(id %in% "18") %>%
    select(row_id, cell_id, text) %>%
    spread(cell_id, text)
}
## # A tibble: 5 x 4
##   row_id       `1`      `2`            `3`
## *  <int>     <chr>    <chr>          <chr>
## 1      1 Header 1  Header 2       Header 3
## 2      2         A    12.23      blah blah
## 3      3         B     1.23 blah blah blah
## 4      4         B      9.0          Salut
## 5      5         C        6          Hello