The function `docx_summary()` returns the content of a Word document.
library(officer)
# Locate the example Word document shipped with the officer package.
example_docx <- system.file(
  "doc_examples", "example.docx",
  package = "officer"
)

# Read the document, then summarise its content and preview the first rows.
doc <- read_docx(example_docx)
content <- docx_summary(doc)
head(content)
## doc_index content_type style_name
## 1 1 paragraph heading 1
## 2 2 paragraph <NA>
## 3 3 paragraph heading 1
## 4 4 paragraph List Paragraph
## 5 5 paragraph List Paragraph
## 6 6 paragraph List Paragraph
## text level num_id
## 1 Title 1 NA NA
## 2 Lorem ipsum dolor sit amet, consectetur adipiscing elit. NA NA
## 3 Title 2 NA NA
## 4 Quisque tristique 1 2
## 5 Augue nisi, et convallis 1 2
## 6 Sapien mollis nec. 1 2
## row_id is_header cell_id col_span row_span
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 NA NA NA NA NA
## 6 NA NA NA NA NA
Explore the results, here the number of document elements found for each content type:
## paragraph table cell
## 17 1
To get all paragraphs:
# Keep only the paragraph rows, with the columns that describe a paragraph.
par_data <- subset(content, content_type %in% "paragraph")
par_data <- par_data[, c("doc_index", "style_name", "text", "level", "num_id")]

# Shorten the text to its first 30 characters so the printed table stays
# narrow. substr() already stops at the end of shorter strings, so no
# explicit clamp on nchar() is needed.
par_data$text <- substr(par_data$text, start = 1L, stop = 30L)
par_data
## doc_index style_name text level num_id
## 1 1 heading 1 Title 1 NA NA
## 2 2 <NA> Lorem ipsum dolor sit amet, co NA NA
## 3 3 heading 1 Title 2 NA NA
## 4 4 List Paragraph Quisque tristique 1 2
## 5 5 List Paragraph Augue nisi, et convallis 1 2
## 6 6 List Paragraph Sapien mollis nec. 1 2
## 7 7 heading 2 Sub title 1 NA NA
## 8 8 List Paragraph Quisque tristique 1 1
## 9 9 List Paragraph Augue nisi, et convallis 1 1
## 10 10 List Paragraph Sapien mollis nec. 1 1
## 11 11 <NA> NA NA
## 12 12 <NA> Phasellus nec nunc vitae nulla NA NA
## 13 13 heading 2 Sub title 2 NA NA
## 14 14 <NA> Morbi rhoncus sapien sit amet NA NA
## 15 15 <NA> NA NA
## 16 17 <NA> NA NA
## 17 18 <NA> NA NA
Tables are unstacked, i.e. each table cell is returned as one row:
## doc_index content_type style_name text level num_id row_id
## 1.1 16 table cell Light Shading Petals NA NA 1
## 1.11 16 table cell Light Shading 5,621498349 NA NA 2
## 1.12 16 table cell Light Shading 4,994616997 NA NA 3
## 1.13 16 table cell Light Shading 4,767504884 NA NA 4
## 1.14 16 table cell Light Shading 25,9242382 NA NA 5
## 1.15 16 table cell Light Shading 6,489375001 NA NA 6
## is_header cell_id col_span row_span
## 1.1 TRUE 1 1 1
## 1.11 FALSE 1 2 1
## 1.12 FALSE 1 1 1
## 1.13 FALSE 1 1 1
## 1.14 FALSE 1 2 1
## 1.15 FALSE 1 1 1
Cell positions and values are dispatched in columns `row_id`, `cell_id`, `text` and `is_header` (a logical column indicating whether the cell is part of a header or not). Note that the content itself (column `text`) is a character vector.
# Keep the non-header cells only, with their grid position and their text.
table_body <- subset(
  table_cells,
  !is_header,
  select = c(row_id, cell_id, text)
)
head(table_body)
## row_id cell_id text
## 1.11 2 1 5,621498349
## 1.12 3 1 4,994616997
## 1.13 4 1 4,767504884
## 1.14 5 1 25,9242382
## 1.15 6 1 6,489375001
## 1.16 7 1 5,7858682
Reshaping the data with columns `row_id`, `cell_id` and `text` would display something close to the original table:
## cell_id
## row_id 1 2 3
## 2 "5,621498349" NA "2,46210657918,2034091"
## 3 "4,994616997" "AA" "2,429320759"
## 4 "4,767504884" NA "AAA"
## 5 "25,9242382" NA "2,066051345"
## 6 "6,489375001" "25,21130805" "2,901582763"
## 7 "5,7858682" "25,52433147" "2,655642742"
## 8 "5,645575295" "Merged cell" "2,278691288"
## 9 "4,828953215" NA "2,238467716"
## 10 "6,783500773" NA "2,202762147"
## 11 "5,395076839" NA "2,538375992"
## 12 "4,683617783" "29,2459239" "2,601945544"
## 13 "Note" NA NA
## cell_id
## row_id 4
## 2 NA
## 3 "17,65204912"
## 4 NA
## 5 "18,37915478"
## 6 "17,3130473717,0721572418,2902189"
## 7 NA
## 8 NA
## 9 "19,87376227"
## 10 "19,85326662"
## 11 "19,56545356"
## 12 "18,95335451"
## 13 NA
Getting headers requires another operation:
# Keep the header cells only, with their grid position and their text.
data <- subset(table_cells, is_header, select = c(row_id, cell_id, text))

# Lay the cell texts out on a row_id x cell_id grid; I() keeps each text as is.
with(
  data,
  tapply(text, list(row_id = row_id, cell_id = cell_id), FUN = I)
)
## cell_id
## row_id 1 2 3 4
## 1 "Petals" "Internode" "Sepal" "Bract"
The function `pptx_summary()` returns the content of a PowerPoint document.
# Locate the example PowerPoint document shipped with the officer package.
example_pptx <- system.file(
  "doc_examples", "example.pptx",
  package = "officer"
)

# Read the presentation, then summarise its content and preview the first rows.
doc <- read_pptx(example_pptx)
content <- pptx_summary(doc)
head(content)
## text id content_type slide_id row_id cell_id col_span
## 1 Title 12 paragraph 1 NA NA NA
## 2 A table 13 paragraph 1 NA NA NA
## 3 and some text 13 paragraph 1 NA NA NA
## 4 and some list (1) 13 paragraph 1 NA NA NA
## 5 and some list (2) 13 paragraph 1 NA NA NA
## 1.1 Header 1 18 table cell 1 1 1 1
## row_span media_file
## 1 NA <NA>
## 2 NA <NA>
## 3 NA <NA>
## 4 NA <NA>
## 5 NA <NA>
## 1.1 1 <NA>
Explore the results:
## image paragraph table cell
## 1 5 2
To get all paragraphs:
## id text
## 1 12 Title
## 2 13 A table
## 3 13 and some text
## 4 13 and some list (1)
## 5 13 and some list (2)
## 11 15 R logo
To get an image:
# Select the row(s) whose content type is "image", then extract the media
# file they reference from the presentation into "extract.png".
image_row <- content[content$content_type %in% "image", ]
media_extract(
  doc,
  path = image_row$media_file,
  target = "extract.png"
)
## [1] TRUE
Tables are unstacked, i.e. each table cell is returned as one row:
## text id content_type slide_id row_id cell_id col_span row_span
## 1.1 Header 1 18 table cell 1 1 1 1 1
## 1.4 A 18 table cell 1 2 1 1 1
## 1.7 B 18 table cell 1 3 1 1 1
## 1.10 B 18 table cell 1 4 1 1 1
## 1.13 C 18 table cell 1 5 1 1 1
## 2.2 Header 2 18 table cell 1 1 2 1 1
## media_file
## 1.1 <NA>
## 1.4 <NA>
## 1.7 <NA>
## 1.10 <NA>
## 1.13 <NA>
## 2.2 <NA>
Cell positions and values are dispatched in columns `row_id`, `cell_id` and `text`. Note that here there is no indicator for the table header.
# Keep the cells of the shape whose id is 18, with grid position and text.
data <- subset(
  table_cells,
  id %in% 18,
  select = c(row_id, cell_id, text)
)

# Lay the cell texts out on a row_id x cell_id grid; I() keeps each text as is.
with(
  data,
  tapply(text, list(row_id = row_id, cell_id = cell_id), FUN = I)
)
## cell_id
## row_id 1 2 3
## 1 "Header 1 " "Header 2" "Header 3"
## 2 "A" "12.23" "blah blah"
## 3 "B" "1.23" "blah blah blah"
## 4 "B" "9.0" "Salut"
## 5 "C" "6" "Hello"