Text Mining of Permanent Downhole Gauges

Vignette Author

2018-02-18

library(petro.One)
library(tm)

# Build the search URL for the phrase of interest. `how = "all"` is passed
# straight through to make_search_url() (presumably "match all of the query
# words" -- confirm against the petro.One documentation).
my_url <- make_search_url(
  query = "Permanent Downhole Gauge",
  how = "all"
)

# Summarise what the search behind `my_url` returned. Lines starting with
# `##` are the output recorded when this vignette was rendered.
get_papers_count(my_url)    # total number of papers matching the query
## [1] 552
papers_by_type(my_url)      # paper counts broken down by paper type
## # A tibble: 3 x 2
##   name              value
##   <chr>             <dbl>
## 1 Conference paper 471   
## 2 Journal paper     77.0 
## 3 Presentation       4.00
# Read the search results into a data frame; `df` is reused by every
# analysis step that follows, then printed here for inspection.
df <- read_multidoc(my_url)
df
## # A tibble: 552 x 6
##    title_data           paper_id   source  type     year author1_data     
##    <chr>                <chr>      <chr>   <chr>   <int> <chr>            
##  1 Wavelet Filtering o~ "        ~ "     ~ "     ~  2009 Pico, Carlos,    
##  2 Encouraging Experie~ "        ~ "     ~ "     ~  2009 Igbokoyi, A.O., ~
##  3 Permanent Downhole ~ "        ~ "     ~ "     ~  2009 Horng, Chen Jiun~
##  4 Pressure Transient ~ "        ~ "     ~ "     ~  2013 Al-hashim, Hasan~
##  5 Reservoir Managemen~ "        ~ "     ~ "     ~  2004 de Oliveira Silv~
##  6 Comparative Analysi~ "        ~ "     ~ "     ~  2014 Enyekwe, A.E., U~
##  7 Analyzing Transient~ "        ~ "     ~ "     ~  2007 Zheng, Shiyi, He~
##  8 Permanent Downhole ~ "        ~ "     ~ "     ~  2014 Pham, Hoanh Van,~
##  9 Wireless Retrofit S~ "        ~ "     ~ "     ~  2014 Green, Annabel, ~
## 10 Permanent Downhole ~ "        ~ "     ~ "     ~  1992 Bezerra, M.F.C.,~
## # ... with 542 more rows
library(petro.One)

# Build the word/frequency table for the papers in `df` and print it.
# NOTE(review): which text column term_frequency() tokenises is not visible
# here -- check the petro.One documentation before relying on it.
term_freq <- term_frequency(df)
term_freq
## # A tibble: 1,514 x 2
##    word        freq
##    <chr>      <int>
##  1 reservoir    127
##  2 well         118
##  3 data          99
##  4 field         85
##  5 pressure      85
##  6 production    84
##  7 downhole      83
##  8 permanent     70
##  9 gas           69
## 10 analysis      62
## # ... with 1,504 more rows
library(petro.One)

# Word cloud of the terms found in `df`, called with max.words = 100 and
# min.freq = 15.
plot_wordcloud(
  df,
  max.words = 100,
  min.freq = 15
)

Bar plot

# Bar chart of term frequencies for `df`, called with min.freq = 25.
plot_bars(
  df,
  min.freq = 25
)

Dendrogram

# Plot the relationships between terms in `df`, called with min.freq = 25
# and threshold = 0.1.
plot_relationships(
  df,
  min.freq = 25,
  threshold = 0.1
)

library(cluster)

# Hierarchical clustering of terms, drawn as a dendrogram.
# Extract the term-document matrix built from the papers in `df`.
tdm <- get_term_document_matrix(df)$tdm

# Drop sparse terms (maximal allowed sparsity 0.93) so that only a small set
# of terms is left to cluster; the recorded run below kept 16 objects.
tdm.rst <- removeSparseTerms(tdm, 0.93)

# Pairwise distances between the remaining terms. "euclidean" is the
# documented method name for stats::dist(); the previous spelling
# "euclidian" only worked through an undocumented alias.
d <- dist(tdm.rst, method = "euclidean")

# Complete-linkage clustering.
# For a different look try substituting: method = "ward.D"
fit <- hclust(d = d, method = "complete")
fit
## 
## Call:
## hclust(d = d, method = "complete")
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 16

# NOTE(review): `hang` is the fraction of the plot height by which labels
# hang below the rest of the plot; hang = -1 (labels aligned at the
# baseline) is the more common choice -- confirm that hang = 1 is intended.
plot(fit, hang = 1)