Extract content from Word and PowerPoint

Import Word document

The function docx_summary() returns the content of a Word document as a tibble.

library(officer)

# Locate the example Word document shipped with the officer package.
example_docx <- system.file("doc_examples/example.docx", package = "officer")

# Read the document, then summarise its content.
doc <- read_docx(example_docx)
content <- docx_summary(doc)
print(content)
## # A tibble: 69 × 11
##    doc_index content_type     style_name
##        <int>        <chr>          <chr>
## 1          1    paragraph      heading 1
## 2          2    paragraph           <NA>
## 3          3    paragraph      heading 1
## 4          4    paragraph List Paragraph
## 5          5    paragraph List Paragraph
## 6          6    paragraph List Paragraph
## 7          7    paragraph      heading 2
## 8          8    paragraph List Paragraph
## 9          9    paragraph List Paragraph
## 10        10    paragraph List Paragraph
## # ... with 59 more rows, and 8 more variables: text <chr>, level <dbl>,
## #   num_id <int>, row_id <int>, is_header <lgl>, cell_id <dbl>,
## #   col_span <dbl>, row_span <dbl>

Explore the results:

library(dplyr)

# Number of distinct document elements (doc_index) for each content type.
summarise(group_by(content, content_type), n = n_distinct(doc_index))
## # A tibble: 2 × 2
##   content_type     n
##          <chr> <int>
## 1    paragraph    17
## 2   table cell     1

To get all paragraphs:

# Keep only the paragraph rows and the columns that describe them.
par_data <- content %>%
  filter(content_type %in% "paragraph") %>%
  select(doc_index, style_name, text, level, num_id) %>%
  # Truncate text to at most 30 characters so that it can be displayed in
  # this vignette. substr() already stops at the end of shorter strings,
  # so no explicit length check is needed.
  mutate(text = substr(text, start = 1, stop = 30))

par_data
## # A tibble: 17 × 5
##    doc_index     style_name                           text level num_id
##        <int>          <chr>                          <chr> <dbl>  <int>
## 1          1      heading 1                        Title 1    NA     NA
## 2          2           <NA> Lorem ipsum dolor sit amet, co    NA     NA
## 3          3      heading 1                        Title 2    NA     NA
## 4          4 List Paragraph             Quisque tristique      1      2
## 5          5 List Paragraph      Augue nisi, et convallis      1      2
## 6          6 List Paragraph            Sapien mollis nec.      1      2
## 7          7      heading 2                    Sub title 1    NA     NA
## 8          8 List Paragraph             Quisque tristique      1      1
## 9          9 List Paragraph      Augue nisi, et convallis      1      1
## 10        10 List Paragraph            Sapien mollis nec.      1      1
## 11        11           <NA>                                   NA     NA
## 12        12           <NA> Phasellus nec nunc vitae nulla    NA     NA
## 13        13      heading 2                    Sub title 2    NA     NA
## 14        14           <NA> Morbi rhoncus sapien sit amet     NA     NA
## 15        15           <NA>                                   NA     NA
## 16        17           <NA>                                   NA     NA
## 17        18           <NA>                                   NA     NA

Word tables

Table content is returned in long format, with one row per table cell:

# Keep only the rows whose content type is a table cell.
table_cells <- filter(content, content_type %in% "table cell")
table_cells
## # A tibble: 52 × 11
##    doc_index content_type    style_name        text level num_id row_id
##        <int>        <chr>         <chr>       <chr> <dbl>  <int>  <int>
## 1         16   table cell Light Shading      Petals    NA     NA      1
## 2         16   table cell Light Shading 5,621498349    NA     NA      2
## 3         16   table cell Light Shading 4,994616997    NA     NA      3
## 4         16   table cell Light Shading 4,767504884    NA     NA      4
## 5         16   table cell Light Shading 6,299445616    NA     NA      5
## 6         16   table cell Light Shading 6,489375001    NA     NA      6
## 7         16   table cell Light Shading   5,7858682    NA     NA      7
## 8         16   table cell Light Shading 5,645575295    NA     NA      8
## 9         16   table cell Light Shading 4,828953215    NA     NA      9
## 10        16   table cell Light Shading 6,783500773    NA     NA     10
## # ... with 42 more rows, and 4 more variables: is_header <lgl>,
## #   cell_id <dbl>, col_span <dbl>, row_span <dbl>

Cell positions and values are stored in the columns row_id, cell_id, text and is_header (a logical column indicating whether the cell is part of the table header). Note that the content (column text) is a character vector.

# Drop the header cells, then keep the cell coordinates and their text.
table_body <- select(
  filter(table_cells, !is_header),
  row_id, cell_id, text
)
print(table_body)
## # A tibble: 48 × 3
##    row_id cell_id        text
##     <int>   <dbl>       <chr>
## 1       2       1 5,621498349
## 2       3       1 4,994616997
## 3       4       1 4,767504884
## 4       5       1 6,299445616
## 5       6       1 6,489375001
## 6       7       1   5,7858682
## 7       8       1 5,645575295
## 8       9       1 4,828953215
## 9      10       1 6,783500773
## 10     11       1 5,395076839
## # ... with 38 more rows

With the columns row_id, cell_id and text, the data can easily be reshaped into a table layout with tidyr:

# Turn each cell_id into its own column, filled with the cell text.
# Only runs when tidyr can be loaded.
if (require("tidyr")) {
  spread(table_body, cell_id, text)
}
## Loading required package: tidyr
## # A tibble: 12 × 5
##    row_id         `1`         `2`         `3`         `4`
## *   <int>       <chr>       <chr>       <chr>       <chr>
## 1       2 5,621498349 29,48059578 2,462106579  18,2034091
## 2       3 4,994616997 28,36024706 2,429320759 17,65204912
## 3       4 4,767504884 27,25431792         AAA        <NA>
## 4       5 6,299445616  25,9242382 2,066051345 18,37915478
## 5       6 6,489375001 25,21130805 2,901582763 17,31304737
## 6       7   5,7858682 25,52433147 2,655642742 17,07215724
## 7       8 5,645575295 Merged cell 2,278691288  18,2902189
## 8       9 4,828953215        <NA> 2,238467716 19,87376227
## 9      10 6,783500773        <NA> 2,202762147 19,85326662
## 10     11 5,395076839        <NA> 2,538375992 19,56545356
## 11     12 4,683617783  29,2459239 2,601945544 18,95335451
## 12     13 4,444226444 25,48653519 2,918513379 19,52866206

Getting headers requires another operation:

# Same reshaping as for the body, applied to the header cells only.
if (require("tidyr")) {
  spread(
    select(filter(table_cells, is_header), row_id, cell_id, text),
    cell_id, text
  )
}
## # A tibble: 1 × 5
##   row_id    `1`       `2`   `3`   `4`
## *  <int>  <chr>     <chr> <chr> <chr>
## 1      1 Petals Internode Sepal Bract

Import PowerPoint document

The function pptx_summary() returns the content of a PowerPoint document as a tibble.

# Locate the example PowerPoint document shipped with the officer package.
example_pptx <- system.file("doc_examples/example.pptx", package = "officer")

# Read the presentation, then summarise its content.
# Note: this overwrites the `doc` and `content` objects created for Word.
doc <- read_pptx(example_pptx)
content <- pptx_summary(doc)
print(content)
## # A tibble: 29 × 9
##                 text    id content_type slide_id row_id cell_id col_span
##                <chr> <chr>        <chr>    <int>  <int>   <int>    <dbl>
## 1              Title    12    paragraph        1     NA      NA       NA
## 2           A table     13    paragraph        1     NA      NA       NA
## 3      and some text    13    paragraph        1     NA      NA       NA
## 4  and some list (1)    13    paragraph        1     NA      NA       NA
## 5  and some list (2)    13    paragraph        1     NA      NA       NA
## 6          Header 1     18   table cell        1      1       1        1
## 7           Header 2    18   table cell        1      1       2        1
## 8           Header 3    18   table cell        1      1       3        1
## 9                  A    18   table cell        1      2       1        1
## 10             12.23    18   table cell        1      2       2        1
## # ... with 19 more rows, and 2 more variables: row_span <dbl>,
## #   media_file <chr>

Explore the results:

# Number of distinct ids for each content type.
summarise(group_by(content, content_type), n = n_distinct(id))
## # A tibble: 3 × 2
##   content_type     n
##          <chr> <int>
## 1        image     1
## 2    paragraph     5
## 3   table cell     1

To get all paragraphs:

# Keep only the paragraph rows, with their id and text.
par_data <- select(
  filter(content, content_type %in% "paragraph"),
  id, text
)

print(par_data)
## # A tibble: 13 × 2
##       id               text
##    <chr>              <chr>
## 1     12              Title
## 2     13           A table 
## 3     13      and some text
## 4     13  and some list (1)
## 5     13  and some list (2)
## 6     15             R logo
## 7      2                 Hi
## 8      3           This is 
## 9      3       an unordered
## 10     3 list of paragraphs
## 11     3                   
## 12     3 This is an ordered
## 13     3 list of paragraphs

To get an image:

# Select the image rows, then extract the referenced media file from the
# presentation into "extract.png".
# NOTE(review): the result recorded below is FALSE, which presumably means
# the file was not written (e.g. the target already existed) -- confirm.
image_row <- filter(content, content_type %in% "image")
media_extract(doc, path = image_row[["media_file"]], target = "extract.png")
## [1] FALSE

PowerPoint tables

Table content is returned in long format, with one row per table cell:

# Keep only the rows whose content type is a table cell.
table_cells <- filter(content, content_type %in% "table cell")
print(table_cells)
## # A tibble: 15 × 9
##              text    id content_type slide_id row_id cell_id col_span
##             <chr> <chr>        <chr>    <int>  <int>   <int>    <dbl>
## 1       Header 1     18   table cell        1      1       1        1
## 2        Header 2    18   table cell        1      1       2        1
## 3        Header 3    18   table cell        1      1       3        1
## 4               A    18   table cell        1      2       1        1
## 5           12.23    18   table cell        1      2       2        1
## 6       blah blah    18   table cell        1      2       3        1
## 7               B    18   table cell        1      3       1        1
## 8            1.23    18   table cell        1      3       2        1
## 9  blah blah blah    18   table cell        1      3       3        1
## 10              B    18   table cell        1      4       1        1
## 11            9.0    18   table cell        1      4       2        1
## 12          Salut    18   table cell        1      4       3        1
## 13              C    18   table cell        1      5       1        1
## 14              6    18   table cell        1      5       2        1
## 15          Hello    18   table cell        1      5       3        1
## # ... with 2 more variables: row_span <dbl>, media_file <chr>

Cell positions and values are stored in the columns row_id, cell_id and text. Note that, unlike for Word documents, there is no column indicating whether a cell belongs to the table header.

# Turn each cell_id into its own column, filled with the cell text.
# Only runs when tidyr can be loaded.
if (require("tidyr")) {
  spread(select(table_cells, row_id, cell_id, text), cell_id, text)
}
## # A tibble: 5 × 4
##   row_id       `1`      `2`            `3`
## *  <int>     <chr>    <chr>          <chr>
## 1      1 Header 1  Header 2       Header 3
## 2      2         A    12.23      blah blah
## 3      3         B     1.23 blah blah blah
## 4      4         B      9.0          Salut
## 5      5         C        6          Hello