“(不一定按此顺序)” 表示您可以进行排序的数字输出。由于您只需要跟踪大于或等于 n 的值,并且输入非常大的 integer 值,因此明智的方法是跟踪 integer范围而不是值。
很大程度上取决于输入。对于具有离散均匀分布的hughes字整数输入,它会消耗更少的内存并且速度会更快。
一个简单的伪代码实现需要每个输入值对照一个最小堆排序的范围堆栈:
Range is
min as integer
max as integer
next as Range
duplicates as stack of integer
Range(pMin, pMax, pNext)
self.min = pMin
self.max = pMax
self.next = pNext
self.duplicates = empty stack of integer
Range heap_top = NULL
Range current_range = NULL
Range previous_range = NULL
boolean merge_flag
integer value
While read value from input
if value >= n Then
current_range = heap_top
previous_range = NULL
merge_flag = false
While current_range is not null
If value is current_range.min - 1 Then
current_range.min = value
merge_flag = true
Break
End If
If value is current_range.max + 1 Then
current_range.max = value
merge_flag = true
Break
End If
If value < current_range.min Then
current_range = NULL
Break
End If
If value is between current_range.min and current_range.max Then
# Here we track duplicates value
current_range.duplicates.push value
Break
End If
previous_range = current_range
current_range = current_range->next
End While
If current_range is not NULL Then
If merge_flag is true Then
If previous_range is not NULL and current_range.min - 1 is previous_range.max Then
# merge current range into previous one
previous_range.max = current_range.max
# Here we track duplicates value
previous_range.duplicates.pushall current_range.duplicates
previous_range.next = current_range.next
drop current_range
# If we need to keep a track of the range where belong the value
# current_range = previous_range
Else
If current_range.next is not NULL and current_range.max + 1 is
current_range.next.min Then
# merge next range into current one
# We use previous_range to point the next range
previous_range = current_range.next
current_range.max = previous_range.max
# Here we track duplicates value
current_range.duplicates.pushall previous_range.duplicates
current_range.next = previous_range.next
drop previous_range
End If
End If
End If
Else
If previous_range is NULL Then
current_range = new Range(value, value, heap_top)
heap_top = current_range
Else
current_range = new Range(value, value, previous_range.next)
previous_range.next = current_range
End If
End If
End If
End While
如果输入是均匀分布的,那么从长远来看,更少的节点意味着更少的节点遍历处理。对每个要处理的输入值进行更少的节点遍历处理意味着更快的全局处理,因为我们有一个接近 O(N) 而不是 O(N!) 的算法,其中 N 作为输入值的数量。
上一个算法的C实现示例:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <string.h>
struct s_range {
unsigned int min;
unsigned int max;
struct s_range *next;
unsigned int *duplicates;
unsigned int duplicates_count;
};
struct s_range *new_range(unsigned int pMin,unsigned int pMax, struct s_range *pNext) {
struct s_range *lRange = malloc(sizeof(struct s_range));
if (lRange == NULL) {
perror("new_range: Failed to allocate");
return NULL;
}
lRange->min = pMin;
lRange->max = pMax;
lRange->next = pNext;
lRange->duplicates = NULL;
lRange->duplicates_count = 0;
return lRange;
}
void drop_range(struct s_range *pRange) {
if (pRange != NULL) {
if (pRange->duplicates != NULL) {
free(pRange->duplicates);
}
free(pRange);
}
}
void drop_allranges(struct s_range *pTopRange) {
struct s_range *lRange;
while (pTopRange != NULL) {
lRange = pTopRange->next;
drop_range(pTopRange);
pTopRange = lRange;
}
}
int push_duplicates(struct s_range *pRange, unsigned int pValue) {
unsigned int *lDuplicates;
if (pRange == NULL) {
return 2;
}
if (pRange->duplicates == NULL) {
lDuplicates = malloc(sizeof(unsigned int));
} else {
lDuplicates = realloc(pRange->duplicates, (pRange->duplicates_count + 1) * sizeof(unsigned int));
}
if (lDuplicates == NULL) {
perror("push_duplicates: failed to allocate...");
return 1;
}
lDuplicates[pRange->duplicates_count++] = pValue;
pRange->duplicates = lDuplicates;
return 0;
}
int pushall_duplicates(struct s_range *pRangeDst, struct s_range *pRangeSrc) {
unsigned int *lDuplicates;
if (pRangeDst == NULL || pRangeSrc == NULL) {
return 2;
}
if (pRangeSrc->duplicates == NULL) {
return 0;
}
if (pRangeDst->duplicates == NULL) {
lDuplicates = malloc(pRangeSrc->duplicates_count * sizeof(unsigned int));
} else {
lDuplicates = realloc(pRangeDst->duplicates, (pRangeDst->duplicates_count + pRangeSrc->duplicates_count) * sizeof(unsigned int));
}
if (lDuplicates == NULL) {
perror("pushall_duplicates: failed to allocate...");
return 1;
}
memcpy(&lDuplicates[pRangeDst->duplicates_count], pRangeSrc->duplicates, pRangeSrc->duplicates_count * sizeof(unsigned int));
pRangeDst->duplicates_count += pRangeSrc->duplicates_count;
pRangeDst->duplicates = lDuplicates;
return 0;
}
int main(int nbargs, char *argv[]) {
struct s_range *lHeapTop = NULL;
struct s_range *lCurrentRange;
struct s_range *lPreviousRange;
unsigned int lMergeFlag;
unsigned int lValue;
unsigned int lN = 3;
unsigned int lDispFlag = 0;
if (nbargs > 1) {
lN = atoi(argv[1]);
}
if (nbargs > 2) {
lDispFlag = atoi(argv[2]);
}
while(fread(&lValue, sizeof(unsigned int), 1, stdin) > 0) {
if (lValue >= lN) {
lCurrentRange = lHeapTop;
lPreviousRange = NULL;
lMergeFlag = 0;
while(lCurrentRange != NULL) {
if (lCurrentRange->min - 1 == lValue) {
lCurrentRange->min = lValue;
lMergeFlag = 1;
break;
}
if (lCurrentRange->max + 1 == lValue) {
lCurrentRange->max = lValue;
lMergeFlag = 1;
break;
}
if (lValue < lCurrentRange->min) {
lCurrentRange = NULL;
break;
}
if (lValue >= lCurrentRange->min && lValue <= lCurrentRange->max) {
if (push_duplicates(lCurrentRange, lValue) != 0) {
drop_allranges(lHeapTop);
return 1;
}
break;
}
lPreviousRange = lCurrentRange;
lCurrentRange = lCurrentRange->next;
}
if (lCurrentRange != NULL) {
if (lMergeFlag == 1) {
if (lPreviousRange != NULL && lCurrentRange->min - 1 == lPreviousRange->max) {
lPreviousRange->max = lCurrentRange->max;
if (pushall_duplicates(lPreviousRange, lCurrentRange) != 0) {
drop_allranges(lHeapTop);
return 1;
}
lPreviousRange->next = lCurrentRange->next;
drop_range(lCurrentRange);
} else {
if (lCurrentRange->next != NULL && lCurrentRange->max + 1 == lCurrentRange->next->min) {
lPreviousRange = lCurrentRange->next;
lCurrentRange->max = lPreviousRange->max;
if (pushall_duplicates(lCurrentRange, lPreviousRange) != 0) {
drop_allranges(lHeapTop);
return 1;
}
lCurrentRange->next = lPreviousRange->next;
drop_range(lPreviousRange);
}
}
}
} else {
if (lPreviousRange == NULL) {
lCurrentRange = new_range(lValue, lValue, lHeapTop);
if (lCurrentRange == NULL) {
drop_allranges(lHeapTop);
return 1;
}
lHeapTop = lCurrentRange;
} else {
lCurrentRange = new_range(lValue, lValue, lPreviousRange->next);
if (lCurrentRange == NULL) {
drop_allranges(lHeapTop);
return 1;
}
lPreviousRange->next = lCurrentRange;
}
}
}
}
// Check the results
if (lDispFlag == 1) {
lCurrentRange = lHeapTop;
while(lCurrentRange != NULL) {
printf("From %u to %u dup:", lCurrentRange->min, lCurrentRange->max);
for (unsigned int lInd = 0; lInd < lCurrentRange->duplicates_count; lInd++) {
printf(" %u", lCurrentRange->duplicates[lInd]);
}
printf("\n");
lCurrentRange = lCurrentRange->next;
}
}
// Cleaning
drop_allranges(lHeapTop);
return 0;
}
在 x64 Debian Buster (4670K CPU) 上使用 离散均匀分布 集 65 536 个字,此代码(range 可执行文件)比经典最小堆一(node 可执行文件):
bash:~$ awk 'BEGIN { for (idx=0;idx<65536;idx++) { v=rand()*256; v2=rand()*256; printf("%c%c%c%c",v,v2,0,0); }}' > data.bin
bash:~$ time cat data.bin | ./range 3
real 0m5.629s
user 0m5.516s
sys 0m0.031s
bash:~$ time cat data.bin | ./node 3
real 0m15.618s
user 0m15.328s
sys 0m0.016s