Thousands of Papers to Dataframe

Alfonso R. Reyes

2018-02-18

The maximum number of rows that a OnePetro query can return is 1000. This means that the user can set up the query to return up to a maximum of 1000 papers. Above that number, the query to OnePetro will return an error.

OnePetro offers options to display 10, 50 or 100 rows per page. Additionally, through scripts like this one, that number can be raised up to 1,000.

This article describes the process of reading multiple pages, with thousands of papers, into a single dataframe.

Retrieve the most numerous paper type

library(petro.One)

What types of papers do we have?

# Search across every document type (no `dc_type` filter) so that the number
# of papers can be compared by type.
# NOTE(review): assumes make_search_url() has defaults for the arguments that
# are omitted here (dc_type, start, rows) -- confirm against the package docs.
my_url <- make_search_url(query = "pressure transient analysis",
                          how = "all")
papers_by_type(my_url)
## # A tibble: 7 x 2
##   name               value
##   <chr>              <dbl>
## 1 Chapter             1.00
## 2 Conference paper 3126   
## 3 General            60.0 
## 4 Journal paper     905   
## 5 Media               5.00
## 6 Other               1.00
## 7 Presentation        7.00

For the time being we will retrieve only conference papers.

Collect first 1000 rows

# Restrict the search to "conference-paper": other document types produce a
# dataframe with a different structure.
my_url_1 <- make_search_url(
  query   = "pressure transient analysis",
  how     = "all",
  dc_type = "conference-paper",
  start   = 0,
  rows    = 1000
)

get_papers_count(my_url_1)
## [1] 3126
# Read the first page of results, write it to an HTML file, then convert that
# file to a dataframe.
htm_1  <- "pta-01-conference.html"
page_1 <- read_onepetro(my_url_1)
xml2::write_html(page_1, file = htm_1)
onepetro_page_to_dataframe(htm_1)
## # A tibble: 1,000 x 6
##    title_data        paper_id   source   type     year author1_data       
##    <chr>             <chr>      <chr>    <chr>   <int> <chr>              
##  1 Pressure Transie~ "        ~ "      ~ "     ~  2003 Rabb, J., Petro-Ca~
##  2 Well-head Pressu~ "        ~ "      ~ "     ~  2013 Spyrou, Charidimos~
##  3 Pressure Transie~ "        ~ "      ~ "     ~  1994 Larsen, Leif, Stat~
##  4 Integrating Pres~ "        ~ "      ~ "     ~  2006 Rahim, Zillur, Res~
##  5 Automated Pressu~ "        ~ "      ~ "     ~  2011 Rees, Hugh Richard~
##  6 Pressure Transie~ "        ~ "      ~ "     ~  1995 Sahni, A., Univers~
##  7 How Wellbore Dyn~ "        ~ "      ~ "     ~  1991 Mattar, L., Fekete~
##  8 Numerical Soluti~ "        ~ "      ~ "     ~  1993 Warren, G.M., SIMT~
##  9 Software Showcas~ "        ~ "      ~ "     ~  1992 Baldwin, J.O., Con~
## 10 Pressure-Transie~ "        ~ "      ~ "     ~  1998 Yildiz, Turhan, Si~
## # ... with 990 more rows

Collect second set of 1000 rows

# Same query as the first page, now starting at row 1000.
my_url_2 <- make_search_url(
  query   = "pressure transient analysis",
  how     = "all",
  dc_type = "conference-paper",
  start   = 1000,
  rows    = 1000
)

# Read the page, write it to an HTML file, then convert that file to a
# dataframe.
htm_2  <- "pta-02-conference.html"
page_2 <- read_onepetro(my_url_2)
xml2::write_html(page_2, file = htm_2)
onepetro_page_to_dataframe(htm_2)
## # A tibble: 1,000 x 6
##    title_data            paper_id   source  type    year author1_data     
##    <chr>                 <chr>      <chr>   <chr>  <int> <chr>            
##  1 Identification of Re~ "        ~ "     ~ "    ~  2017 Asalkhuzina, G. ~
##  2 Improving the Conduc~ "        ~ "     ~ "    ~  2016 Keshavarz, Alire~
##  3 Characterization of ~ "        ~ "     ~ "    ~  2016 Zhan, J., Univer~
##  4 A Comprehensive Mode~ "        ~ "     ~ "    ~  2016 Chen, Z. M., Chi~
##  5 The Value of Reservo~ "        ~ "     ~ "    ~  2017 Shbair, Alaa F.,~
##  6 Modeling the Depleti~ "        ~ "     ~ "    ~  2017 Zhang, Fengshou,~
##  7 Evaluation Of The Ef~ "        ~ "     ~ "    ~  2015 Ali, Tariq A., C~
##  8 Methods of Research ~ "        ~ "     ~ "    ~  2015 Davletbaev, A., ~
##  9 Identification and C~ "        ~ "     ~ "    ~  2015 Ding, Shuaiwei, ~
## 10 AOF Analysis of One ~ "        ~ "     ~ "    ~  2015 Pang, Wei, Sinop~
## # ... with 990 more rows

Collect next set of 1000 rows

# Same query again, now starting at row 2000.
my_url_3 <- make_search_url(
  query   = "pressure transient analysis",
  how     = "all",
  dc_type = "conference-paper",
  start   = 2000,
  rows    = 1000
)

# Read the page, write it to an HTML file, then convert that file to a
# dataframe.
htm_3  <- "pta-03-conference.html"
page_3 <- read_onepetro(my_url_3)
xml2::write_html(page_3, file = htm_3)
onepetro_page_to_dataframe(htm_3)
## # A tibble: 1,000 x 6
##    title_data              paper_id   source  type     year author1_data  
##    <chr>                   <chr>      <chr>   <chr>   <int> <chr>         
##  1 Analytical and Numeric~ "        ~ "     ~ "     ~  2013 Alharthy, Naj~
##  2 Data Acquisition In Pu~ "        ~ "     ~ "     ~  2006 Cimic, Miljen~
##  3 Characterisation and M~ "        ~ "     ~ "     ~  2012 Sirat, Manhal~
##  4 Estimating Non-Darcy F~ "        ~ "     ~ "     ~  2002 Spivey, J.P.,~
##  5 Radius of Investigatio~ "        ~ "     ~ "     ~  2009 Kuchuk, Fikri~
##  6 Examples of Advanced T~ "        ~ "     ~ "     ~  2012 Cardoso, Elme~
##  7 Analysis of Decline Cu~ "        ~ "     ~ "     ~  2014 Shahamat, M.S~
##  8 Application of Linear ~ "        ~ "     ~ "     ~  1982 Kohlhaas, Cha~
##  9 Rate Behavior of Compo~ "        ~ "     ~ "     ~  1991 Olarewaju, J.~
## 10 Pressure Transient and~ "        ~ "     ~ "     ~  2004 Fuentes-Cruz,~
## # ... with 990 more rows

Collect remaining set

# Build the URL for the last page of results, starting at row 3000.
# NOTE(review): get_papers_count() reported 3126 conference papers, so 126
# remain from start = 3000, but rows = 100 requests only 100 of them; the last
# 26 papers are never fetched and the combined table ends up with 3,100 rows.
# Consider requesting the full remainder and regenerating the outputs shown in
# this article.
my_url_4 <- make_search_url(query = "pressure transient analysis", 
                          how = "all", 
                          dc_type = "conference-paper",
                          start = 3000,
                          rows  = 100)

# Read the page, write it to an HTML file, then convert that file to a
# dataframe.
page_4 <- read_onepetro(my_url_4)
htm_4 <- "pta-04-conference.html"
xml2::write_html(page_4, file = htm_4)
onepetro_page_to_dataframe(htm_4)
## # A tibble: 100 x 6
##    title_data             paper_id  source  type     year author1_data    
##    <chr>                  <chr>     <chr>   <chr>   <int> <chr>           
##  1 Analysis of Coal Gas ~ "       ~ "     ~ "     ~  1993 Mavor, M.J., Re~
##  2 "Analysis of Pressure~ "       ~ "     ~ "     ~  1999 Tiab, Djebbar, ~
##  3 Gas Storage Capacity ~ "       ~ "     ~ "     ~  2014 Kang, Seungmo, ~
##  4 Fast-Marching Methods~ "       ~ "     ~ "     ~  2013 Zhang, Yanbin, ~
##  5 Pressure Transient Be~ "       ~ "     ~ "     ~  1983 Okpobiri, G.A.,~
##  6 Interwell Tracer Test~ "       ~ "     ~ "     ~  2005 Du, Yuqi, Chevr~
##  7 New Models for Time-C~ "       ~ "     ~ "     ~  2016 Yousuf, Wajid, ~
##  8 Numerical Simulation ~ "       ~ "     ~ "     ~  1986 Nakornthap, K.,~
##  9 Predicting Horizontal~ "       ~ "     ~ "     ~  1989 Chang, M-M., Na~
## 10 Production Data Analy~ "       ~ "     ~ "     ~  2008 Lewis, Adam Mic~
## # ... with 90 more rows

Binding tables in one dataframe

# Convert each saved HTML page to a dataframe.
p1 <- onepetro_page_to_dataframe(htm_1)
p2 <- onepetro_page_to_dataframe(htm_2)
p3 <- onepetro_page_to_dataframe(htm_3)
p4 <- onepetro_page_to_dataframe(htm_4)

# Stack the four tables row-wise, in page order, into one dataframe.
papers <- do.call(rbind, list(p1, p2, p3, p4))
papers
## # A tibble: 3,100 x 6
##    title_data        paper_id   source   type     year author1_data       
##    <chr>             <chr>      <chr>    <chr>   <int> <chr>              
##  1 Pressure Transie~ "        ~ "      ~ "     ~  2003 Rabb, J., Petro-Ca~
##  2 Well-head Pressu~ "        ~ "      ~ "     ~  2013 Spyrou, Charidimos~
##  3 Pressure Transie~ "        ~ "      ~ "     ~  1994 Larsen, Leif, Stat~
##  4 Integrating Pres~ "        ~ "      ~ "     ~  2006 Rahim, Zillur, Res~
##  5 Automated Pressu~ "        ~ "      ~ "     ~  2011 Rees, Hugh Richard~
##  6 Pressure Transie~ "        ~ "      ~ "     ~  1995 Sahni, A., Univers~
##  7 How Wellbore Dyn~ "        ~ "      ~ "     ~  1991 Mattar, L., Fekete~
##  8 Numerical Soluti~ "        ~ "      ~ "     ~  1993 Warren, G.M., SIMT~
##  9 Software Showcas~ "        ~ "      ~ "     ~  1992 Baldwin, J.O., Con~
## 10 Pressure-Transie~ "        ~ "      ~ "     ~  1998 Yildiz, Turhan, Si~
## # ... with 3,090 more rows

Find which papers have the search word in the title

# Case-insensitive search for the query phrase in the paper titles; `rows`
# holds the indices of the matching titles.
pattern <- "pressure transient analysis"
rows <- grep(pattern, papers[["title_data"]], ignore.case = TRUE)
papers[rows, ]
## # A tibble: 163 x 6
##    title_data        paper_id   source   type     year author1_data       
##    <chr>             <chr>      <chr>    <chr>   <int> <chr>              
##  1 Pressure Transie~ "        ~ "      ~ "     ~  2003 Rabb, J., Petro-Ca~
##  2 Well-head Pressu~ "        ~ "      ~ "     ~  2013 Spyrou, Charidimos~
##  3 Pressure Transie~ "        ~ "      ~ "     ~  1994 Larsen, Leif, Stat~
##  4 Integrating Pres~ "        ~ "      ~ "     ~  2006 Rahim, Zillur, Res~
##  5 Automated Pressu~ "        ~ "      ~ "     ~  2011 Rees, Hugh Richard~
##  6 Pressure Transie~ "        ~ "      ~ "     ~  1995 Sahni, A., Univers~
##  7 How Wellbore Dyn~ "        ~ "      ~ "     ~  1991 Mattar, L., Fekete~
##  8 Numerical Soluti~ "        ~ "      ~ "     ~  1993 Warren, G.M., SIMT~
##  9 Software Showcas~ "        ~ "      ~ "     ~  1992 Baldwin, J.O., Con~
## 10 Pressure Transie~ "        ~ "      ~ "     ~  1986 Clonts, M.D., ARCO~
## # ... with 153 more rows
# Clean up: delete the HTML files written to disk by the steps above.
files <- c(
  htm_1,
  htm_2,
  htm_3,
  htm_4
)
file.remove(files)
## [1] TRUE TRUE TRUE TRUE