|
4 | 4 | #' @param format "simple" uses CSV and returns pure character data frame, while |
5 | 5 | #' "smart" fetches JSON-formatted data and returns a data frame with datetime |
6 | 6 | #' columns converted to `POSIXct` |
7 | | -#' @param ... Additional parameters to supply to [httr::POST] |
8 | | -#' @return A `data.frame` |
| 7 | +#' @return A tibble data frame |
9 | 8 | #' @examples |
10 | | -#' # R's versions and release dates: |
11 | | -#' sparql_query <- 'SELECT DISTINCT |
| 9 | +#' sparql_query <- "SELECT |
12 | 10 | #' ?softwareVersion ?publicationDate |
13 | 11 | #' WHERE { |
14 | 12 | #' BIND(wd:Q206904 AS ?R) |
15 | 13 | #' ?R p:P348 [ |
16 | 14 | #' ps:P348 ?softwareVersion; |
17 | 15 | #' pq:P577 ?publicationDate |
18 | 16 | #' ] . |
19 | | -#' }' |
| 17 | +#' }" |
20 | 18 | #' query_wikidata(sparql_query) |
21 | 19 | #' |
22 | 20 | #' \dontrun{ |
23 | | -#' # "smart" format converts all datetime columns to POSIXct |
24 | 21 | #' query_wikidata(sparql_query, format = "smart") |
25 | 22 | #' } |
| 23 | +#' @section Query limits: |
| 24 | +#' There is a hard query deadline, which is set to 60 seconds. There |
| 25 | +#' are also the following limits: |
| 26 | +#' - One client (user agent + IP) is allowed 60 seconds of processing time every |
| 27 | +#' 60 seconds |
| 28 | +#' - One client is allowed 30 error queries per minute |
| 29 | +#' See the [query limits section](https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits) |
| 30 | +#' in the WDQS user manual for more information. |
26 | 31 | #' @seealso [get_example] |
27 | 32 | #' @export |
28 | | -query_wikidata <- function(sparql_query, format = c("simple", "smart"), ...) { |
29 | | - if (!format[1] %in% c("simple", "smart")) { |
| 33 | +query_wikidata <- function(sparql_query, format = c("simple", "smart")) { |
| 34 | + format <- format[1] |
| 35 | + if (!format %in% c("simple", "smart")) { |
30 | 36 | stop("`format` must be either \"simple\" or \"smart\"") |
31 | 37 | } |
32 | 38 | output <- lapply(sparql_query, function(sparql_query) { |
33 | | - if (format[1] == "simple") { |
34 | | - response <- httr::POST( |
35 | | - url = "https://query.wikidata.org/sparql", |
36 | | - query = list(query = sparql_query), |
37 | | - httr::add_headers(Accept = "text/csv"), |
38 | | - httr::user_agent("https://github.com/bearloga/WikidataQueryServiceR"), |
39 | | - ... |
40 | | - ) |
| 39 | + rate_limited_query <- wdqs_requester() |
| 40 | + if (format == "simple") { |
| 41 | + response <- rate_limited_query(sparql_query, httr::add_headers(Accept = "text/csv")) |
41 | 42 | httr::stop_for_status(response) |
42 | 43 | if (httr::http_type(response) == "text/csv") { |
43 | | - con <- textConnection(httr::content(response, as = "text", encoding = "UTF-8")) |
44 | | - df <- utils::read.csv(con, header = TRUE, stringsAsFactors = FALSE) |
45 | | - message(nrow(df), " rows were returned by WDQS") |
46 | | - return(df) |
| 44 | + content <- httr::content(response, as = "text", encoding = "UTF-8") |
| 45 | + return(readr::read_csv(content)) |
47 | 46 | } else { |
48 | 47 | stop("returned response is not formatted as a CSV") |
49 | 48 | } |
50 | 49 | } else { |
51 | | - response <- httr::GET( |
52 | | - url = "https://query.wikidata.org/sparql", |
53 | | - query = list(query = sparql_query), |
54 | | - format = "json", |
55 | | - httr::user_agent("https://github.com/bearloga/WikidataQueryServiceR"), |
56 | | - ... |
57 | | - ) |
| 50 | + response <- rate_limited_query(sparql_query, httr::add_headers(Accept = "application/sparql-results+json")) |
58 | 51 | httr::stop_for_status(response) |
59 | 52 | if (httr::http_type(response) == "application/sparql-results+json") { |
60 | | - temp <- jsonlite::fromJSON(httr::content(response, as = "text", encoding = "UTF-8"), simplifyVector = FALSE) |
| 53 | + content <- httr::content(response, as = "text", encoding = "UTF-8") |
| 54 | + temp <- jsonlite::fromJSON(content, simplifyVector = FALSE) |
61 | 55 | } |
62 | 56 | if (length(temp$results$bindings) > 0) { |
63 | | - df <- as.data.frame(dplyr::bind_rows(lapply(temp$results$bindings, function(x) { |
64 | | - return(lapply(x, function(y) { return(y$value) })) |
65 | | - }))) |
66 | | - datetime_cols <- vapply(temp$results$bindings[[1]], function(x) { |
67 | | - if ("datatype" %in% names(x)) { |
68 | | - return(x$datatype == "http://www.w3.org/2001/XMLSchema#dateTime") |
| 57 | + data_frame <- purrr::map_dfr(temp$results$bindings, function(binding) { |
| 58 | + return(purrr::map_chr(binding, ~ .x$value)) |
| 59 | + }) |
| 60 | + datetime_columns <- purrr::map_lgl(temp$results$bindings[[1]], function(binding) { |
| 61 | + if ("datatype" %in% names(binding)) { |
| 62 | + return(binding[["datatype"]] == "http://www.w3.org/2001/XMLSchema#dateTime") |
69 | 63 | } else { |
70 | 64 | return(FALSE) |
71 | 65 | } |
72 | | - }, FALSE) |
73 | | - if (any(datetime_cols)) { |
74 | | - for (datetime_col in which(datetime_cols)) { |
75 | | - df[[datetime_col]] <- as.POSIXct(df[[datetime_col]], format = "%Y-%m-%dT%H:%M:%SZ", tz = "GMT") |
76 | | - } |
77 | | - } |
78 | | - message(nrow(df), " rows were returned by WDQS") |
79 | | - return(df) |
| 66 | + }) |
| 67 | + data_frame <- dplyr::mutate_if( |
| 68 | + .tbl = data_frame, |
| 69 | + .predicate = datetime_columns, |
| 70 | + .funs = as.POSIXct, |
| 71 | + format = "%Y-%m-%dT%H:%M:%SZ", tz = "GMT" |
| 72 | + ) |
80 | 73 | } else { |
81 | | - message("0 rows were returned by WDQS") |
82 | | - return(data.frame(matrix(character(), nrow = 0, ncol = length(temp$head$vars), |
83 | | - dimnames = list(c(), unlist(temp$head$vars))), |
84 | | - stringsAsFactors = FALSE)) |
| 74 | + data_frame <- dplyr::as_tibble( |
| 75 | + matrix( |
| 76 | + character(), |
| 77 | + nrow = 0, ncol = length(temp$head$vars), |
| 78 | + dimnames = list(c(), unlist(temp$head$vars)) |
| 79 | + ) |
| 80 | + ) |
85 | 81 | } |
| 82 | + return(data_frame) |
86 | 83 | } |
87 | 84 | }) |
88 | 85 | if (length(output) == 1) { |
|
0 commit comments