# Install (if needed) and load packages ----
# requireNamespace() is preferred over require() for an availability check:
# it tests whether the package can be loaded without attaching it, and
# returns FALSE quietly instead of emitting a "Loading required package"
# message on success.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  tidyverse, # tidyverse pkgs including purrr
  tictoc,    # performance test
  rvest      # web scraping
)
Use `safely()` and `possibly()` to make error handling easier.

Challenge 1
# URLs to scrape: three valid Wikipedia pages plus one deliberately
# unresolvable host ("https://DLAB") used to demonstrate error handling.
url_lists <- c(
  "https://en.wikipedia.org/wiki/University_of_California,_Berkeley",
  "https://en.wikipedia.org/wiki/Stanford_University",
  "https://en.wikipedia.org/wiki/Carnegie_Mellon_University",
  "https://DLAB"
)

# Challenge 1: mapping read_html() directly fails as soon as one URL is
# unreachable -- the whole map() call errors out and no results are kept.
# (The original text called map() before url_lists was defined; the
# definition must come first.)
map(url_lists, read_html)
There are three kinds of messages you will run into if your code has a problem: errors, warnings, and messages. The following functions handle them.
The basic logic of `tryCatch()`, R's basic error-handling function, works like the following.
# tryCatch() evaluates `expr` and dispatches to the first matching handler;
# the `finally` argument is evaluated last regardless of the outcome (its
# value is discarded, so the call returns the handler's value: "Error").
tryCatch(
  expr = {
    map(url_lists, read_html)
  },
  warning = function(w) "Warning",
  error = function(e) "Error",
  finally = "Message"
)
## [1] "Error"
`safely()` is the purrr version of the try-catch mechanism (it evaluates code and assigns exception handlers). For each input it outputs a two-element list: `result` (the value, or `NULL` if an error occurred) and `error` (`NULL`, or the error that occurred).
# Wrap read_html() with safely(): each element becomes a list holding
# $result and $error, so one broken URL no longer aborts the whole map().
url_lists %>% map(safely(read_html))
## [[1]]
## [[1]]$result
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject ...
##
## [[1]]$error
## NULL
##
##
## [[2]]
## [[2]]$result
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject ...
##
## [[2]]$error
## NULL
##
##
## [[3]]
## [[3]]$result
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject ...
##
## [[3]]$error
## NULL
##
##
## [[4]]
## [[4]]$result
## NULL
##
## [[4]]$error
## <simpleError in open.connection(x, "rb"): Could not resolve host: DLAB>
map(url_lists, safely(read_html)) %>%
  map(~ .x[["result"]]) %>% # keep only the $result element of each pair
  purrr::compact()          # drop the NULL results (the failed URLs)
## [[1]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject ...
##
## [[2]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject ...
##
## [[3]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject ...
What if the best way to solve the problem is not ignoring the error, but recording which input caused it?
# If an error occurs, possibly() stores the `otherwise` value
# ("The URL is broken.") in that element instead of aborting the map().
out <- map(
  url_lists,
  possibly(read_html, otherwise = "The URL is broken.")
)

# Find the broken URL(s). map_lgl() + identical() is safer than the
# original `out[seq(out)] == "The URL is broken."`: `out[seq(out)]` is a
# no-op (it re-selects every element), and `==` on a list relies on
# implicit coercion of html_document objects to character.
url_lists[map_lgl(out, ~ identical(.x, "The URL is broken."))]
## [1] "https://DLAB"