Make a word cloud

Alfonso R. Reyes

2018-02-18

Load the metadata of 2,918 papers

library(petro.One)
library(tm)
library(tibble)

use_example(1)    # make the example HTML result pages available in the working directory

# convert the saved OnePetro result pages to data frames
p1 <- onepetro_page_to_dataframe("1000_conference.html")
p2 <- onepetro_page_to_dataframe("2000_conference.html")
p3 <- onepetro_page_to_dataframe("3000_conference.html")

nn_papers <- rbind(p1, p2, p3)    # combine the three pages into one table
nn_papers
## # A tibble: 2,918 x 6
##    title_data       paper_id   source   type     year author1_data        
##    <chr>            <chr>      <chr>    <chr>   <int> <chr>               
##  1 Neural Networks~ "        ~ "      ~ "     ~  2002 Russell, Brian, Ham~
##  2 Deconvolution U~ "        ~ "      ~ "     ~  1996 Essenreiter, Robert~
##  3 Neural Network ~ "        ~ "      ~ "     ~  1992 Schmidt, Jumndyr, P~
##  4 Hydrocarbon Pre~ "        ~ "      ~ "     ~  2000 Xiangjun, Zhang, In~
##  5 Higher-Order Ne~ "        ~ "      ~ "     ~  1994 Kumoluyi, A.O., Imp~
##  6 Multiple Attenu~ "        ~ "      ~ "     ~  2000 Karrenbach, M., Uni~
##  7 Conductive frac~ "        ~ "      ~ "     ~  1995 Thomas, Andrew L., ~
##  8 APPLYING NEURAL~ "        ~ "      ~ "     ~  2002 Silva, M., Petróleo~
##  9 Bit Bounce Dete~ "        ~ "      ~ "     ~  2004 Vassallo, Massimili~
## 10 Artificial Neur~ "        ~ "      ~ "     ~  2014 Lind, Yuliya B., Ba~
## # ... with 2,908 more rows
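
If more result pages were saved, the three per-page calls above could be replaced by a loop. A minimal sketch, assuming the same three HTML files:

pages <- c("1000_conference.html", "2000_conference.html", "3000_conference.html")
nn_papers <- do.call(rbind, lapply(pages, onepetro_page_to_dataframe))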

Convert and clean the documents for text mining

vdocs <- VCorpus(VectorSource(nn_papers$title_data))      # build corpus from titles
vdocs <- tm_map(vdocs, content_transformer(tolower))      # to lowercase
vdocs <- tm_map(vdocs, removeWords, stopwords("english")) # remove stopwords
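
Depending on how noisy the titles are, other standard tm transformations can be applied as well. This is a minimal sketch of optional extra cleaning steps, not part of the original workflow:

vdocs <- tm_map(vdocs, removePunctuation)   # strip punctuation
vdocs <- tm_map(vdocs, removeNumbers)       # drop digits
vdocs <- tm_map(vdocs, stripWhitespace)     # collapse repeated spaces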

Summary table with word frequencies

tdm <- TermDocumentMatrix(vdocs)    # terms as rows, documents as columns

tdm.matrix <- as.matrix(tdm)
tdm.rs <- sort(rowSums(tdm.matrix), decreasing = TRUE)    # total frequency per word
tdm.df <- data.frame(word = names(tdm.rs), freq = tdm.rs, stringsAsFactors = FALSE)
as_tibble(tdm.df)                          # prevent long printing of the data frame
## # A tibble: 5,145 x 2
##    word        freq
##  * <chr>      <dbl>
##  1 using        666
##  2 neural       520
##  3 reservoir    499
##  4 data         348
##  5 seismic      291
##  6 network      288
##  7 artificial   283
##  8 analysis     249
##  9 prediction   245
## 10 networks     227
## # ... with 5,135 more rows

There are 5,145 words under analysis. We will focus our attention on the words that occur at least 50 times.
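
A quick way to check which terms clear that threshold is tm's findFreqTerms(). This is a minimal sketch using the tdm object built above; the variable name freq_terms is only illustrative:

freq_terms <- findFreqTerms(tdm, lowfreq = 50)   # terms appearing 50 times or more
length(freq_terms)                               # how many words pass the cutoff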

Word cloud with words that occur at least 50 times

library(wordcloud)

set.seed(1234)
wordcloud(words = tdm.df$word, freq = tdm.df$freq, min.freq = 50,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
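
To keep a copy of the cloud, the same call can be wrapped in a graphics device. This is a minimal sketch using base R's png() device; the file name is arbitrary:

png("nn_wordcloud.png", width = 800, height = 800)
wordcloud(words = tdm.df$word, freq = tdm.df$freq, min.freq = 50,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
dev.off()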

Note that the word cloud contains common-use words such as using, use, new, approach, and case. These words are not technical enough to reveal where the papers we are analyzing are focusing. In the next example, we will build our own custom stopword list to prevent these words from showing up.
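
As a preview of that next example, custom stopwords can be removed with the same removeWords transformation used earlier. A minimal sketch; the word list below is only illustrative:

my_stopwords <- c("using", "use", "new", "approach", "case")   # illustrative list
vdocs <- tm_map(vdocs, removeWords, my_stopwords)              # drop the custom stopwords
# then rebuild the term-document matrix and redraw the word cloud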