以下是我基于 C# 给出的解决思路:它未必能彻底解决这个问题,但可以提升性能。下面的代码解决的是“检索存储库默认分支中的所有提交”这一问题,不过它几乎可以应用于 GitHub GraphQL 上任何基于游标的分页场景。我知道您的问题是关于“所有分支的所有提交,并去除重复”,但我相信这种方法对您同样有用。
查询大型存储库的固有问题在于:每页最多返回 100 个结果,而且必须逐页迭代,因为每一页都包含指向下一页的游标。我的方案解决了如何预先确定各页游标的问题,从而可以同时发送所有页面的请求,缩短整体执行时间。
思路是先向 GitHub GraphQL API 发出一个初始请求,只获取符合给定过滤条件的提交总数(totalCount)。我假设每页获取 100 个结果。由于 GitHub 提交页面的游标始终采用“xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 99”这种格式,其中第一部分是第一次提交的 oid(即第一页的第一次提交 - 所有页面的游标都使用这个 oid,它在迭代过程中不会改变),而 99 是上一页最后一次提交的序号(从 0 开始的索引),因此只需发出一次“totalCount”请求,就可以很容易地计算出每一页的游标。以一个包含 670 次提交的存储库为例,各页的游标依次是:
- null(第一页没有游标)
- “xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 99”
- “xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 199”
- “xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 299”
- “xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 399”
- “xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 499”
- “xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 599”
在生成标识每个页面起点的游标之后,我们可以为每个页面准备一个单独的 Task,每个 Task 包含一个向 GitHub GraphQL 获取单页数据的请求,然后使用 Task.WhenAll 并发执行全部 Task。
我已经在一个包含 670 个提交的存储库上对此进行了测试,全部 7 个页面总共在大约 7 秒内获取完毕。如果逐页顺序遍历,每页大约需要 4 秒,总共需要 25 - 30 秒。
需要注意的是,这段代码没有在生产环境中测试过,也没有包含错误处理,并且并行/并发的实现很可能还有改进空间,所以它只能被视为概念验证。此外,我不确定当针对一个拥有 100 或 1000 个提交页面的存储库同时发出这么多请求时,GitHub API 会如何处理。
/// <summary>
/// Retrieves the default-branch commits made since <paramref name="since"/> by computing
/// every page cursor up front and requesting all pages concurrently.
/// </summary>
/// <param name="integrationId">Not referenced by this method's body.</param>
/// <param name="since">Lower bound passed to the commit history queries.</param>
/// <param name="repositoryName">Name of the repository to query.</param>
/// <param name="repositoryOwner">Owner of the repository to query.</param>
/// <returns>The commits of all fetched pages, concatenated in page (cursor) order.</returns>
public async Task<List<Commit>> GetCommitsByPeriodAsync(Guid integrationId, DateTime since, string repositoryName, string repositoryOwner)
{
    // The very first page is requested without a cursor.
    string startCursor = null;

    // One metadata request yields what is needed to derive all page cursors.
    var historyInfo = await GetDefaultBranchCommitsFirstPageInfoAsync(since, startCursor, repositoryOwner, repositoryName);

    // Start one page request per cursor; none of them waits on another.
    var pageRequests = new List<Task<GetCommitsPageResponse>>();
    foreach (var pageCursor in GetCommitPagesCursors(historyInfo, startCursor))
    {
        pageRequests.Add(GetDefaultBranchCommitsPageByPeriodAsync(since, pageCursor, repositoryOwner, repositoryName));
    }

    // Task.WhenAll returns results in the order the tasks were supplied.
    var pages = await Task.WhenAll(pageRequests);

    // Flatten the pages into a single list of commits.
    var allCommits =
        (from page in pages
         from commit in page.Commits
         select commit).ToList();

    return allCommits;
}
/// <summary>
/// Pre-computes the "after" cursor of every commit-history page so that all pages can be
/// requested without waiting for the previous page's response.
/// </summary>
/// <remarks>
/// Relies on the cursor layout "&lt;oid&gt; &lt;index&gt;", where the index is the zero-based
/// position of the last commit on the previous page. A cursor is only emitted when at least
/// one commit exists after it, so no request is issued for a page that would be empty.
/// </remarks>
/// <param name="firstPageInfo">Total commit count and end cursor of the first page.</param>
/// <param name="initialCursor">Cursor used for the first page; always the first entry of the result.</param>
/// <returns>
/// The cursors of all pages in order. Contains only <paramref name="initialCursor"/> when the
/// first page has no end cursor or already covers every commit.
/// </returns>
/// <exception cref="FormatException">The end cursor is not in the "&lt;oid&gt; &lt;index&gt;" form.</exception>
private List<string> GetCommitPagesCursors(GetCommitsPageInfoResponse firstPageInfo, string initialCursor)
{
    // 100 is the max number of objects in a page
    const int pageSize = 100;

    // The first page is always fetched with the initial cursor.
    var cursors = new List<string> { initialCursor };

    string firstPageEndCursor = firstPageInfo.PageInfo.EndCursor;
    if (string.IsNullOrEmpty(firstPageEndCursor))
    {
        // No end cursor (e.g. no commit matched the filter): there is nothing beyond the first page.
        return cursors;
    }

    // Fail loudly on an unexpected cursor instead of silently generating wrong cursors.
    string[] cursorParts = firstPageEndCursor.Split(' ');
    if (cursorParts.Length != 2
        || !int.TryParse(
            cursorParts[1],
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture,
            out int lastIndexOfPreviousPage))
    {
        throw new FormatException($"Unexpected commit cursor format: '{firstPageEndCursor}'.");
    }

    string firstCommitId = cursorParts[0];
    int totalCount = firstPageInfo.TotalCount;

    // A cursor "oid N" starts its page at commit N + 1, so it is only useful while
    // N + 1 < totalCount. The first page's end cursor is added verbatim; later ones are derived.
    string nextPageCursor = firstPageEndCursor;
    while (lastIndexOfPreviousPage < totalCount - 1)
    {
        cursors.Add(nextPageCursor);
        lastIndexOfPreviousPage += pageSize;
        nextPageCursor = $"{firstCommitId} {lastIndexOfPreviousPage}";
    }

    return cursors;
}
/// <summary>
/// Builds the "GetCommitsFirstPage" GraphQL request, which asks only for paging metadata
/// (totalCount, endCursor, hasNextPage) of the default branch's commit history and selects
/// no commit nodes. Sending the request and producing the return value are omitted here.
/// </summary>
/// <param name="since">Lower bound for the history; sent as the $commitsSince variable.</param>
/// <param name="cursor">Value sent as the $cursor variable ("after" argument of history).</param>
/// <param name="repositoryOwner">Value sent as the $repositoryOwner variable.</param>
/// <param name="repositoryName">Value sent as the $repositoryName variable.</param>
/// <returns>
/// NOTE(review): the return statement is not shown; presumably the response exposes
/// TotalCount and PageInfo.EndCursor, which GetCommitPagesCursors reads - confirm.
/// </returns>
public async Task<GetCommitsPageInfoResponse> GetDefaultBranchCommitsFirstPageInfoAsync(DateTime since, string cursor, string repositoryOwner, string repositoryName)
{
// Code omitted for brevity
// Metadata-only query: the history selection contains totalCount and pageInfo, no edges/nodes.
var commitsRequest = new GraphQLRequest
{
Query = @"
query GetCommitsFirstPage($cursor: String, $commitsSince: GitTimestamp!, $repositoryName: String!, $repositoryOwner: String!) {
repository(name: $repositoryName, owner: $repositoryOwner) {
defaultBranchRef{
target {
... on Commit {
history(after: $cursor, since: $commitsSince) {
totalCount
pageInfo {
endCursor
hasNextPage
}
}
}
}
}
}
}",
// Must match the operation name declared in the query text above.
OperationName = "GetCommitsFirstPage",
Variables = new
{
// "o" is the round-trip (ISO 8601) date/time format specifier.
commitsSince = since.ToString("o"),
cursor = cursor,
repositoryOwner = repositoryOwner,
repositoryName = repositoryName
}
};
// Code omitted for brevity
}
/// <summary>
/// Builds the "GetCommitsSinceTimestamp" GraphQL request, which selects one page of the
/// default branch's commit history (paging info plus per-commit details) starting after
/// the given cursor. Sending the request and producing the return value are omitted here.
/// </summary>
/// <param name="since">Lower bound for the history; sent as the $commitsSince variable.</param>
/// <param name="cursor">Value sent as the $cursor variable ("after" argument of history).</param>
/// <param name="repositoryOwner">Value sent as the $repositoryOwner variable.</param>
/// <param name="repositoryName">Value sent as the $repositoryName variable.</param>
/// <returns>
/// NOTE(review): the return statement is not shown; presumably the response exposes the
/// page's commits through Commits, which GetCommitsByPeriodAsync reads - confirm.
/// </returns>
public async Task<GetCommitsPageResponse> GetDefaultBranchCommitsPageByPeriodAsync(DateTime since, string cursor, string repositoryOwner, string repositoryName)
{
// Code omitted for brevity
// Per commit the query selects: oid, additions/deletions, URLs, committedDate, up to 10
// associated pull requests, the owning repository, author details and the message.
var commitsRequest = new GraphQLRequest
{
Query = @"
query GetCommitsSinceTimestamp($cursor: String, $commitsSince: GitTimestamp!, $repositoryName: String!, $repositoryOwner: String!) {
repository(name: $repositoryName, owner: $repositoryOwner) {
defaultBranchRef{
target {
... on Commit {
history(after: $cursor, since: $commitsSince) {
pageInfo {
endCursor
hasNextPage
}
edges {
node {
oid
additions
deletions
commitUrl
url
committedDate
associatedPullRequests (first: 10) {
nodes {
id
mergedAt
}
}
repository {
databaseId
nameWithOwner
}
author {
name
email
user {
login
}
}
message
}
}
}
}
}
}
}
}",
// Must match the operation name declared in the query text above.
OperationName = "GetCommitsSinceTimestamp",
Variables = new
{
// "o" is the round-trip (ISO 8601) date/time format specifier.
commitsSince = since.ToString("o"),
cursor = cursor,
repositoryOwner = repositoryOwner,
repositoryName = repositoryName
}
};
// Code omitted for brevity
}