October 24, 2008
bearophile Wrote:
> There's a third possible solution, that is very fast: assuming the hash values are uint (32 bit), then you can create a bitfield of 2^32 bits, it requires just 1/2 GB, that is 1/4 of my RAM. So each bit represents a possible different value of the hash.

If the file is "small" my first implementation is good enough (small means the associative array has to fit in for example 2 GB). For larger files here's the implementation of the third idea, with D1, Phobos + my libs:

import d.all, std.string, std.c.stdlib, std.outofmemory;

const uint bpc = uint.sizeof * 8;

bool isFlagSet(uint* flags, uint i) {
    int offset = i / bpc;
    uint mask = 1 << (i % bpc);
    return (flags[offset] & mask) != 0;
}

void setFlag(uint* flags, uint i) {
    int offset = i / bpc;
    uint mask = 1 << (i % bpc);
    if ((flags[offset] & mask) == 0)
        flags[offset] |= mask;
}

void main() {
    uint* seen = cast(uint*)calloc(1 << 29, 1);
    uint* seen_again = cast(uint*)calloc(1 << 29, 1);
    if (seen is null || seen_again is null)
        throw new OutOfMemoryException();

    foreach (line; xfile("data.txt")) {
        uint h = hash(line.chomp());
        if (isFlagSet(seen, h))
            // not used std.intrinsics, because they segfault here
            setFlag(seen_again, h);
        else
            setFlag(seen, h);
    }
    free(seen);

    int[string] aa;
    int horizon;
    foreach (nline, line; xfile("data.txt")) {
        auto clean_line = line.chomp();
        if (isFlagSet(seen_again, hash(clean_line))) {
            auto pos_ptr = clean_line in aa;
            if (pos_ptr) {
                if ((nline - *pos_ptr) > horizon)
                     horizon = nline - *pos_ptr;
            } else
                aa[clean_line.dup] = nline;
        }
    }

    putr(horizon);
}


With D1, just Phobos:

import std.stdio, std.string, std.c.stdlib, std.stream, std.outofmemory;

const uint bpc = uint.sizeof * 8;

bool isFlagSet(uint* flags, uint i) {
    int offset = i / bpc;
    uint mask = 1 << (i % bpc);
    return (flags[offset] & mask) != 0;
}

void setFlag(uint* flags, uint i) {
    int offset = i / bpc;
    uint mask = 1 << (i % bpc);
    if ((flags[offset] & mask) == 0)
        flags[offset] |= mask;
}

void main() {
    uint* seen = cast(uint*)calloc(1 << 29, 1);
    uint* seen_again = cast(uint*)calloc(1 << 29, 1);
    if (seen is null || seen_again is null)
        throw new OutOfMemoryException();

    foreach (string line; new BufferedFile("data.txt")) {
        auto clean_line = line.chomp();
        uint h = typeid(string).getHash(&clean_line);
        if (isFlagSet(seen, h))
            // not used std.intrinsics, because they segfault here
            setFlag(seen_again, h);
        else
            setFlag(seen, h);
    }
    free(seen);

    int[string] aa;
    int horizon, nline;
    foreach (string line; new BufferedFile("data.txt")) {
        auto clean_line = line.chomp();
        if (isFlagSet(seen_again, typeid(string).getHash(&clean_line))) {
            auto pos_ptr = clean_line in aa;
            if (pos_ptr) {
                if ((nline - *pos_ptr) > horizon)
                     horizon = nline - *pos_ptr;
            } else
                aa[clean_line.dup] = nline;
        }
        nline++;
    }

    writefln(horizon);
}


The worst case is like this:
A
B
C
D
E
E
D
C
B
A

In this case the aa associative array has to store half of the lines of the file, and it be too much.

As you can see with this little program I have found another bug in Phobos, the intrinsics in this case don't work (and I have improved the Record of my libs too, used in another version of this code).

Bye,
bearophile
Top | Discussion index | About this forum | D home