Extract content from Word and PowerPoint

Import Word document

The function docx_summary() returns the content of a Word document.

library(officer)
# Locate the sample Word document bundled with the officer package
example_docx <- system.file(
  "doc_examples/example.docx",
  package = "officer"
)
# Read the document and summarise its content as a data frame
doc <- read_docx(path = example_docx)
content <- docx_summary(doc)
# Preview the first rows of the summary
head(content, n = 6L)
##   doc_index content_type     style_name
## 1         1    paragraph      heading 1
## 2         2    paragraph           <NA>
## 3         3    paragraph      heading 1
## 4         4    paragraph List Paragraph
## 5         5    paragraph List Paragraph
## 6         6    paragraph List Paragraph
##                                                        text level num_id
## 1                                                   Title 1    NA     NA
## 2 Lorem ipsum dolor sit amet, consectetur adipiscing elit.     NA     NA
## 3                                                   Title 2    NA     NA
## 4                                        Quisque tristique      1      2
## 5                                 Augue nisi, et convallis      1      2
## 6                                       Sapien mollis nec.      1      2
##   row_id is_header cell_id col_span row_span
## 1     NA        NA      NA       NA       NA
## 2     NA        NA      NA       NA       NA
## 3     NA        NA      NA       NA       NA
## 4     NA        NA      NA       NA       NA
## 5     NA        NA      NA       NA       NA
## 6     NA        NA      NA       NA       NA

Explore the results:

# Count the distinct doc_index values found for each content type
with(
  content,
  tapply(doc_index, content_type, function(ids) length(unique(ids)))
)
##  paragraph table cell 
##         17          1

To get all paragraphs:

# Keep only the paragraph rows and the columns relevant to paragraphs
par_data <- content[
  content$content_type %in% "paragraph",
  c("doc_index", "style_name", "text", "level", "num_id")
]
# Shorten each text to at most 30 characters to keep the printout narrow;
# substr() returns shorter strings unchanged
par_data$text <- substr(par_data$text, start = 1, stop = 30)
par_data
##    doc_index     style_name                           text level num_id
## 1          1      heading 1                        Title 1    NA     NA
## 2          2           <NA> Lorem ipsum dolor sit amet, co    NA     NA
## 3          3      heading 1                        Title 2    NA     NA
## 4          4 List Paragraph             Quisque tristique      1      2
## 5          5 List Paragraph      Augue nisi, et convallis      1      2
## 6          6 List Paragraph            Sapien mollis nec.      1      2
## 7          7      heading 2                    Sub title 1    NA     NA
## 8          8 List Paragraph             Quisque tristique      1      1
## 9          9 List Paragraph      Augue nisi, et convallis      1      1
## 10        10 List Paragraph            Sapien mollis nec.      1      1
## 11        11           <NA>                                   NA     NA
## 12        12           <NA> Phasellus nec nunc vitae nulla    NA     NA
## 13        13      heading 2                    Sub title 2    NA     NA
## 14        14           <NA> Morbi rhoncus sapien sit amet     NA     NA
## 15        15           <NA>                                   NA     NA
## 16        17           <NA>                                   NA     NA
## 17        18           <NA>                                   NA     NA

Word tables

Tables are unstacked, with one row per table cell:

##      doc_index content_type    style_name        text level num_id row_id
## 1.1         16   table cell Light Shading      Petals    NA     NA      1
## 1.11        16   table cell Light Shading 5,621498349    NA     NA      2
## 1.12        16   table cell Light Shading 4,994616997    NA     NA      3
## 1.13        16   table cell Light Shading 4,767504884    NA     NA      4
## 1.14        16   table cell Light Shading  25,9242382    NA     NA      5
## 1.15        16   table cell Light Shading 6,489375001    NA     NA      6
##      is_header cell_id col_span row_span
## 1.1       TRUE       1        1        1
## 1.11     FALSE       1        2        1
## 1.12     FALSE       1        1        1
## 1.13     FALSE       1        1        1
## 1.14     FALSE       1        2        1
## 1.15     FALSE       1        1        1

Cell positions and values are stored in the columns row_id, cell_id, text and is_header (a logical column indicating whether the cell is part of a header). Note that the content itself (column text) is a character vector.

##      row_id cell_id        text
## 1.11      2       1 5,621498349
## 1.12      3       1 4,994616997
## 1.13      4       1 4,767504884
## 1.14      5       1  25,9242382
## 1.15      6       1 6,489375001
## 1.16      7       1   5,7858682

Reshaping the data with columns row_id, cell_id and text would display something close to the original table:

##       cell_id
## row_id 1             2             3                      
##     2  "5,621498349" NA            "2,46210657918,2034091"
##     3  "4,994616997" "AA"          "2,429320759"          
##     4  "4,767504884" NA            "AAA"                  
##     5  "25,9242382"  NA            "2,066051345"          
##     6  "6,489375001" "25,21130805" "2,901582763"          
##     7  "5,7858682"   "25,52433147" "2,655642742"          
##     8  "5,645575295" "Merged cell" "2,278691288"          
##     9  "4,828953215" NA            "2,238467716"          
##     10 "6,783500773" NA            "2,202762147"          
##     11 "5,395076839" NA            "2,538375992"          
##     12 "4,683617783" "29,2459239"  "2,601945544"          
##     13 "Note"        NA            NA                     
##       cell_id
## row_id 4                                 
##     2  NA                                
##     3  "17,65204912"                     
##     4  NA                                
##     5  "18,37915478"                     
##     6  "17,3130473717,0721572418,2902189"
##     7  NA                                
##     8  NA                                
##     9  "19,87376227"                     
##     10 "19,85326662"                     
##     11 "19,56545356"                     
##     12 "18,95335451"                     
##     13 NA

Getting headers requires another operation:

##       cell_id
## row_id 1        2           3       4      
##      1 "Petals" "Internode" "Sepal" "Bract"

Import PowerPoint document

The function pptx_summary() returns the content of a PowerPoint document.

# Locate the sample PowerPoint document bundled with the officer package
example_pptx <- system.file(
  "doc_examples/example.pptx",
  package = "officer"
)
# Read the presentation and summarise its content as a data frame
doc <- read_pptx(path = example_pptx)
content <- pptx_summary(doc)
# Preview the first rows of the summary
head(content, n = 6L)
##                  text id content_type slide_id row_id cell_id col_span
## 1               Title 12    paragraph        1     NA      NA       NA
## 2            A table  13    paragraph        1     NA      NA       NA
## 3       and some text 13    paragraph        1     NA      NA       NA
## 4   and some list (1) 13    paragraph        1     NA      NA       NA
## 5   and some list (2) 13    paragraph        1     NA      NA       NA
## 1.1         Header 1  18   table cell        1      1       1        1
##     row_span media_file
## 1         NA       <NA>
## 2         NA       <NA>
## 3         NA       <NA>
## 4         NA       <NA>
## 5         NA       <NA>
## 1.1        1       <NA>

Explore the results:

# Count the distinct id values found for each content type
with(
  content,
  tapply(id, content_type, function(ids) length(unique(ids)))
)
##      image  paragraph table cell 
##          1          5          2

To get all paragraphs:

# Keep only the paragraph rows, restricted to the id and text columns
par_data <- content[
  content$content_type %in% "paragraph",
  c("id", "text")
]
head(par_data)
##    id              text
## 1  12             Title
## 2  13          A table 
## 3  13     and some text
## 4  13 and some list (1)
## 5  13 and some list (2)
## 11 15            R logo

To get an image:

# Select the rows that describe images, then extract the referenced media
# file from the presentation into "extract.png"
image_row <- content[content$content_type %in% "image", ]
media_extract(
  doc,
  path = image_row$media_file,
  target = "extract.png"
)
## [1] TRUE

PowerPoint tables

Tables are unstacked, with one row per table cell:

##           text id content_type slide_id row_id cell_id col_span row_span
## 1.1  Header 1  18   table cell        1      1       1        1        1
## 1.4          A 18   table cell        1      2       1        1        1
## 1.7          B 18   table cell        1      3       1        1        1
## 1.10         B 18   table cell        1      4       1        1        1
## 1.13         C 18   table cell        1      5       1        1        1
## 2.2   Header 2 18   table cell        1      1       2        1        1
##      media_file
## 1.1        <NA>
## 1.4        <NA>
## 1.7        <NA>
## 1.10       <NA>
## 1.13       <NA>
## 2.2        <NA>

Cell positions and values are stored in the columns row_id, cell_id and text. Note that here there is no indicator for the table header.

##       cell_id
## row_id 1           2          3               
##      1 "Header 1 " "Header 2" "Header 3"      
##      2 "A"         "12.23"    "blah blah"     
##      3 "B"         "1.23"     "blah blah blah"
##      4 "B"         "9.0"      "Salut"         
##      5 "C"         "6"        "Hello"