In [1]:
## loading packages
using Base.Threads
using MurmurHash3
using Formatting
using Random
using Statistics
In [2]:
const targetFilename = "random_data.bin"
targetFileStat = stat(targetFilename)
numEntries = div(targetFileStat.size, 32)
@assert targetFileStat.size % 32 == 0
printfmtln("number of entries: {:d}", numEntries)
number of entries: 10000000
In [3]:
## generate seeds
const numHashFuncs = 60
Random.seed!(0x1a2b3c4d)
hashSeeds = rand(UInt32(0x000FFFFF):UInt32(0x0FFFFFFF), numHashFuncs)
Out[3]:
60-element Array{UInt32,1}:
 0x056927c0
 0x0925a38d
 0x0ee112b2
 0x01af6d7a
 0x06236ebf
 0x08d81e74
 0x03bcafeb
 0x0d9b4b92
 0x038a4c9a
 0x06b03801
 0x00e01486
 0x08e6ca55
 0x0cc27402
          ⋮
 0x08765d12
 0x01a2dc09
 0x026aecbc
 0x0d6fa770
 0x08e81aa3
 0x0857d260
 0x0491ea7e
 0x0526c813
 0x077ca08d
 0x042e7f6d
 0x06ec1bbb
 0x0c966717
In [4]:
## create a lookup table
const lookup = zeros(UInt64, 255)
for i in UInt8(1):UInt8(255)
    count = 0
    for j in 0:7
        shifted = i >> j
        if shifted & UInt8(1) == UInt8(0)
            count += 1
        else
            break
        end
    end
    #print(count, " ")
    lookup[i] = count
end
In [5]:
## function declaration

function count_tail_zeros(x::UInt64)
    byteArr = reinterpret(UInt8, [x])
    count = 0
    for i in 1:8
        if byteArr[i] == UInt8(0)
            count += 8
        else
            count += lookup[byteArr[i]]
            break
        end
    end
    return count
end

function flajolet_martin(startInd::UInt, endInd::UInt, seeds::Array{UInt32, 1})
    startLoc = startInd * 32
    endLoc = endInd * 32
    fileLength = endLoc - startLoc
    # open and read the buffer
    inFile = open(targetFilename, "r")
    seek(inFile, startLoc)
    buffer = read(inFile, fileLength)
    close(inFile)
    @assert length(buffer) == fileLength

    # create a c buffer and use its pointer
    cPtr = convert(Ptr{UInt8}, pointer_from_objref(buffer))

    maxR = zeros(UInt, numHashFuncs)

    for i in 1:32:fileLength
        hashVals = [mmhash128_a(32, cPtr + i, seedVal)[1] for seedVal in seeds]
        r = [count_tail_zeros(val) for val in hashVals]
        for j in 1:length(maxR)
            maxR[j] = max(maxR[j], r[j])
        end
    end

    return maxR

end
Out[5]:
flajolet_martin (generic function with 1 method)
In [6]:
# test counting function
count_tail_zeros(UInt64(2^16))
Out[6]:
0x0000000000000010
In [7]:
## split the task
numTask = 100
numEntriesPerTask = UInt(ceil(numEntries / numTask))
printfmtln("number of entries per task: {:d}", numEntriesPerTask)
number of entries per task: 100000
In [8]:
function compute_distinct_elements()
    # run with multiple threads
    println("number of threads:", nthreads())
    result = zeros(UInt, (numTask, numHashFuncs))

    @threads for i in 1:numTask
    startInd = UInt(i - 1) * UInt(numEntriesPerTask)
    endInd = min(UInt(numEntries), UInt(i) * UInt(numEntriesPerTask))
    result[i, :] .= flajolet_martin(startInd, endInd, hashSeeds)
    println("task $i finished")
    end

    result
end


@elapsed result = compute_distinct_elements()
number of threads:6
task 1 finished
task 18 finished
task 52 finished
task 35 finished
task 69 finished
task 85 finished
task 2 finished
task 19 finished
task 53 finished
task 36 finished
task 70 finished
task 86 finished
task 3 finished
task 20 finished
task 54 finished
task 37 finished
task 71 finished
task 87 finished
task 4 finished
task 21 finished
task 55 finished
task 38 finished
task 88 finished
task 72 finished
task 5 finished
task 56 finished
task 22 finished
task 39 finished
task 89 finished
task 6 finished
task 73 finished
task 57 finished
task 7 finished
task 23 finished
task 40 finished
task 90 finished
task 74 finished
task 8 finished
task 58 finished
task 24 finished
task 41 finished
task 91 finished
task 75 finished
task 9 finished
task 59 finished
task 25 finished
task 42 finished
task 76 finished
task 92 finished
task 10 finished
task 60 finished
task 43 finished
task 77 finished
task 26 finished
task 93 finished
task 11 finished
task 44 finished
task 61 finished
task 78 finished
task 27 finished
task 94 finished
task 12 finished
task 45 finished
task 62 finished
task 79 finished
task 28 finished
task 95 finished
task 13 finished
task 46 finished
task 63 finished
task 80 finished
task 29 finished
task 14 finished
task 96 finished
task 47 finished
task 64 finished
task 15 finished
task 30 finished
task 81 finished
task 97 finished
task 16 finished
task 48 finished
task 65 finished
task 31 finished
task 82 finished
task 98 finished
task 17 finished
task 49 finished
task 66 finished
task 32 finished
task 83 finished
task 99 finished
task 50 finished
task 67 finished
task 33 finished
task 84 finished
task 100 finished
task 51 finished
task 68 finished
task 34 finished
Out[8]:
33.616328495
In [9]:
maxResult = mapslices(maximum, result, dims=[1])
println(maxResult)
medians = Array{Float64, 1}(undef, 0)
groups = 6
numElePerGroup = Int(numHashFuncs / groups)
for i in 1:groups
    slice = maxResult[(i-1)*numElePerGroup + 1 : i*numElePerGroup]
    push!(medians, median(slice))
end
meanVal = mean(medians)
println(meanVal)
numDistinct = 2^meanVal
UInt64[0x0000000000000018 0x0000000000000017 0x0000000000000016 0x0000000000000016 0x0000000000000014 0x0000000000000017 0x0000000000000013 0x000000000000001a 0x0000000000000016 0x0000000000000014 0x0000000000000013 0x0000000000000014 0x0000000000000016 0x0000000000000018 0x0000000000000017 0x0000000000000013 0x0000000000000014 0x0000000000000017 0x0000000000000015 0x0000000000000014 0x0000000000000014 0x0000000000000015 0x0000000000000016 0x0000000000000017 0x0000000000000014 0x0000000000000018 0x0000000000000016 0x0000000000000014 0x0000000000000016 0x0000000000000019 0x0000000000000015 0x0000000000000018 0x0000000000000014 0x0000000000000018 0x0000000000000015 0x0000000000000016 0x0000000000000015 0x0000000000000018 0x0000000000000018 0x0000000000000014 0x0000000000000015 0x0000000000000014 0x0000000000000017 0x0000000000000017 0x0000000000000013 0x0000000000000018 0x0000000000000015 0x0000000000000012 0x0000000000000016 0x0000000000000016 0x0000000000000014 0x0000000000000016 0x0000000000000016 0x0000000000000015 0x0000000000000015 0x0000000000000014 0x0000000000000017 0x0000000000000015 0x0000000000000015 0x0000000000000015]
21.416666666666668
Out[9]:
2.7993620698523982e6