【发布时间】:2019-06-22 23:47:59
【问题描述】:
基于How to collect paginated API responses using spring boot WebClient?
我创建了以下爬虫类
class GitlabCrawler(private val client: WebClient, private val token: String) {
fun fetchCommits(project: URI): Flux<Commit> {
return fetchCommitsInternal(project).expand { cr: ClientResponse? ->
val nextUrl = getNextUrl(cr)
nextUrl?.let { fetchCommitsInternal(URI.create(it)) }
?: Mono.empty<ClientResponse>()
}.limitRate(1)
.flatMap { cr: ClientResponse? -> cr?.bodyToFlux(Commit::class.java) ?: Flux.empty() }
}
private fun getNextUrl(cr: ClientResponse?):String? {
// TODO replace with proper link parsing
return cr?.headers()?.header(HttpHeaders.LINK)?.firstOrNull()
?.splitToSequence(",")
?.find { it.endsWith("rel=\"next\"") }
?.let { it.substring(it.indexOf('<') + 1, it.lastIndexOf('>')) }
}
private fun fetchCommitsInternal(url: URI): Mono<ClientResponse> {
return client.get()
.uri(url)
.accept(MediaType.APPLICATION_JSON_UTF8)
.header("Private-Token", token)
.exchange()
}
}
data class Commit(
val id: String,
val message: String,
@JsonProperty("parent_ids") val parentIds: List<String>,
@JsonProperty("created_at") val createdAt: String)
我想避免不必要的请求,但它执行的请求比完成请求所需的要多。
gitlabCrawler.fetchCommits(URI.create("https://...")).take(15).collectList().block()
只需要一个请求,因为每个页面包含 20 个条目,但它会启动第二个页面请求。它似乎总是要求多一页而不是必要的。我尝试使用limitRate,但这似乎没有效果。
有没有办法让它变得懒惰,即只有在当前用尽时才请求下一页?
【问题讨论】:
标签: kotlin spring-webflux project-reactor