View mode: basic / threaded / horizontal-split · Log in · Help
October 24, 2008
Re: [OT] The horizon of a stream
bearophile Wrote:
> There's a third possible solution, that is very fast: assuming the hash values are uint (32 bit), then you can create a bitfield of 2^32 bits, it requires just 1/2 GB, that is 1/4 of my RAM. So each bit represents a possible different value of the hash.

If the file is "small" my first implementation is good enough (small means the associative array has to fit in for example 2 GB). For larger files here's the implementation of the third idea, with D1, Phobos + my libs:

import d.all, std.string, std.c.stdlib, std.outofmemory;

const uint bpc = uint.sizeof * 8;

bool isFlagSet(uint* flags, uint i) {
   int offset = i / bpc;
   uint mask = 1 << (i % bpc);
   return (flags[offset] & mask) != 0;
}

void setFlag(uint* flags, uint i) {
   int offset = i / bpc;
   uint mask = 1 << (i % bpc);
   if ((flags[offset] & mask) == 0)
       flags[offset] |= mask;
}

void main() {
   uint* seen = cast(uint*)calloc(1 << 29, 1);
   uint* seen_again = cast(uint*)calloc(1 << 29, 1);
   if (seen is null || seen_again is null)
       throw new OutOfMemoryException();

   foreach (line; xfile("data.txt")) {
       uint h = hash(line.chomp());
       if (isFlagSet(seen, h))
           // not used std.intrinsics, because they segfault here
           setFlag(seen_again, h);
       else
           setFlag(seen, h);
   }
   free(seen);

   int[string] aa;
   int horizon;
   foreach (nline, line; xfile("data.txt")) {
       auto clean_line = line.chomp();
       if (isFlagSet(seen_again, hash(clean_line))) {
           auto pos_ptr = clean_line in aa;
           if (pos_ptr) {
               if ((nline - *pos_ptr) > horizon)
                    horizon = nline - *pos_ptr;
           } else
               aa[clean_line.dup] = nline;
       }
   }

   putr(horizon);
}


With D1, just Phobos:

import std.stdio, std.string, std.c.stdlib, std.stream, std.outofmemory;

const uint bpc = uint.sizeof * 8;

bool isFlagSet(uint* flags, uint i) {
   int offset = i / bpc;
   uint mask = 1 << (i % bpc);
   return (flags[offset] & mask) != 0;
}

void setFlag(uint* flags, uint i) {
   int offset = i / bpc;
   uint mask = 1 << (i % bpc);
   if ((flags[offset] & mask) == 0)
       flags[offset] |= mask;
}

void main() {
   uint* seen = cast(uint*)calloc(1 << 29, 1);
   uint* seen_again = cast(uint*)calloc(1 << 29, 1);
   if (seen is null || seen_again is null)
       throw new OutOfMemoryException();

   foreach (string line; new BufferedFile("data.txt")) {
       auto clean_line = line.chomp();
       uint h = typeid(string).getHash(&clean_line);
       if (isFlagSet(seen, h))
           // not used std.intrinsics, because they segfault here
           setFlag(seen_again, h);
       else
           setFlag(seen, h);
   }
   free(seen);

   int[string] aa;
   int horizon, nline;
   foreach (string line; new BufferedFile("data.txt")) {
       auto clean_line = line.chomp();
       if (isFlagSet(seen_again, typeid(string).getHash(&clean_line))) {
           auto pos_ptr = clean_line in aa;
           if (pos_ptr) {
               if ((nline - *pos_ptr) > horizon)
                    horizon = nline - *pos_ptr;
           } else
               aa[clean_line.dup] = nline;
       }
       nline++;
   }

   writefln(horizon);
}


The worst case is like this:
A
B
C
D
E
E
D
C
B
A

In this case the aa associative array has to store half of the lines of the file, and it be too much.

As you can see with this little program I have found another bug in Phobos, the intrinsics in this case don't work (and I have improved the Record of my libs too, used in another version of this code).

Bye,
bearophile
Top | Discussion index | About this forum | D home