可以先用 readLines 读取 url 的内容，再交给 XML 包的 htmlParse 解析，然后运行相同的 XPath 表达式。下面的代码在 lapply 中用索引 [1:5] 只处理前 5 个 CSV 文件（weatherFiles 的长度为 3,344）。
# Base address of the archive; file names found on the page are appended to it
# later to build the download links.
url <- "https://sci.ncas.ac.uk/leedsweather/Archive/"
# Read the page at `url` as text lines and parse them into an HTML document.
# The call is namespace-qualified (like xpathSApply below) so it does not
# depend on library(XML) having been attached.
doc <- XML::htmlParse(readLines(url))
## query the url to get all the file names ending in '.csv'
# The XPath predicate keeps only href attributes whose last four characters
# are '.csv'.
weatherFiles <- XML::xpathSApply(doc, path="//a/@href['.csv'=substring(., string-length(.) - 3)]")
# Download each of the first five CSV files and import it as a data frame.
# Yields one list element per file: a data frame, or NULL when the import fails.
df_list <- lapply(weatherFiles[1:5], function(f) {
  # One local path shared by the download and the import, so read.csv opens
  # exactly the file that download.file just wrote.
  local_path <- file.path(getwd(), f)

  # Download the file locally using *wb* (binary) mode.
  download.file(paste0(url, f), local_path, mode = "wb")

  # Import with tryCatch so a problematic CSV does not abort the whole run;
  # the failure is reported as a warning and the file is dropped later as NULL.
  tryCatch(
    read.csv(local_path, stringsAsFactors = FALSE),
    error = function(e) {
      warning("Failed to read ", local_path, ": ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
})
# Discard the NULL placeholders left behind by files that could not be imported.
df_list <- Filter(Negate(is.null), df_list)
# Normalise the column names of every data frame (more rules may need adding).
df_list <- lapply(df_list, function(frame) {
  # 1. Cut everything from the first run of three dots to the end of the name.
  cols <- gsub("\\.\\.\\..*$", "", names(frame))
  # 2. Remove any remaining single dots.
  cols <- gsub("\\.", "", cols)
  # 3. Unify the spelling of the wind-direction column.
  cols <- gsub("Winddir", "WindDir", cols)
  names(frame) <- cols
  frame
})
# Append all data frames into one.
# Alternatives kept for reference only (not run):
# final_df <- base::do.call(rbind, df_list) # FAILS IF COLUMN NUMBER AND NAMES DO NOT MATCH
# final_df <- plyr::ldply(unname(df_list), data.frame)
# bind_rows matches columns by name; a column absent from one file is filled
# with NA for that file's rows.
final_df <- dplyr::bind_rows(df_list)
输出 (前 5 个 CSV)
结构
# Show the compact structure (column names and types) of the combined data frame.
str(final_df)
'data.frame': 1152 obs. of 20 variables:
$ Timestamp : chr "2018-07-15 00:00:00" "2018-07-15 00:05:00" "2018-07-15 00:10:00" "2018-07-15 00:15:00" ...
$ Temp : num 64 63.9 63.7 63.5 63.4 ...
$ Chill : num 64 63.9 63.7 63.5 63.4 ...
$ HIndex : num 64 63.9 63.7 63.5 63.4 ...
$ Humid : num 75 75 75.8 76 76.2 ...
$ Dewpt : num 55.9 55.8 55.9 55.8 55.8 ...
$ Wind : num 2.09 1.6 2.66 2.62 2.68 ...
$ HiWind : num 5 5 5 6 6 5 5 5 5 6 ...
$ WindDir : num 293 312 305 310 325 ...
$ Rain : num 0 0 0 0 0 0 0 0 0 0 ...
$ RainRate : num 0 0 0 0 0 0 0 0 0 0 ...
$ Barom : num 29.7 29.7 29.7 29.7 29.7 ...
$ Solar : num 0 0 0 0 0 0 0 0 0 0 ...
$ ET : num 0.000254 0.000253 0.000248 0.000243 0.000238 ...
$ UV : num 0 0 0 0 0 0 0 0 0 0 ...
$ InsideTemp : num 75.6 75.6 75.6 75.6 75.6 ...
$ InsideHumid : num 46.8 47 47 47 47 ...
$ TimestampUTC: chr NA NA NA NA ...
$ Pressure : num NA NA NA NA NA NA NA NA NA NA ...
$ Radiation : num NA NA NA NA NA NA NA NA NA NA ...
行
# Show the first rows of the combined data frame.
head(final_df)
Timestamp Temp Chill HIndex Humid Dewpt Wind HiWind WindDir Rain RainRate Barom
1 2018-07-15 00:00:00 63.99530 63.99530 63.99530 75.00000 55.90705 2.087248 5 292.7708 0 0 29.72160
2 2018-07-15 00:05:00 63.85101 63.85101 63.85101 75.00000 55.76779 1.597315 5 312.1093 0 0 29.72003
3 2018-07-15 00:10:00 63.71074 63.71074 63.71074 75.77852 55.91708 2.657718 5 304.5494 0 0 29.71821
4 2018-07-15 00:15:00 63.54564 63.54564 63.54564 76.00000 55.83850 2.617450 6 309.9667 0 0 29.71859
5 2018-07-15 00:20:00 63.38658 63.38658 63.38658 76.21477 55.76223 2.684564 6 324.8235 0 0 29.71940
6 2018-07-15 00:25:00 63.25800 63.25800 63.25800 77.00000 55.92122 2.206667 5 325.4155 0 0 29.71861
Solar ET UV InsideTemp InsideHumid TimestampUTC Pressure Radiation
1 0 0.0002544508 0 75.6 46.79866 <NA> NA NA
2 0 0.0002525977 0 75.6 47.00000 <NA> NA NA
3 0 0.0002480646 0 75.6 47.00000 <NA> NA NA
4 0 0.0002431687 0 75.6 47.00000 <NA> NA NA
5 0 0.0002382909 0 75.6 47.00000 <NA> NA NA
6 0 0.0002372467 0 75.6 47.00000 <NA> NA NA