June 22, 2010
== Quote from Rory McGuire (rmcguire@neonova.co.za)'s article
> On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson <Ben.Hanson@tfbplc.co.uk> wrote:
> > Here's the latest with naming convention (hopefully) followed. I've
> > implemented my
> > own squeeze() function and used sizeof in the memmove calls.
> >
> > How can I specify wide strings for the literals?
> >
> > Thanks,
> >
> > Ben
> >
> > module main;
> >
> > import std.algorithm;
> > import std.array;
> > import std.c.string;
> > import std.string;
> >
> > import std.stdio;
> >
> > template regex(CharT)
> > {
> > struct BasicStringToken
> > {
> >     bool negated = false;
> >     CharT[] charset;
> >     enum size_t MAX_CHARS = CharT.max + 1;
> >     enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
> >
> >     this(const bool negated_, ref CharT[] charset_)
> >     {
> >         negated = negated_;
> >         charset = charset_;
> >     }
> >
> >     void removeDuplicates()
> >     {
> >         charset.sort;
> >         squeeze(charset);
> >     }
> >
> >     void normalise()
> >     {
> >         if (charset.length == MAX_CHARS)
> >         {
> >             negated = !negated;
> >             charset.clear();
> >         }
> >         else if (charset.length > MAX_CHARS / 2)
> >         {
> >             negate();
> >         }
> >     }
> >
> >     void negate()
> >     {
> >         CharT curr_char = START_CHAR;
> >         CharT[] temp;
> >         CharT *ptr = cast(CharT *) 0;
> >         CharT *curr = charset.ptr;
> >         CharT *end = curr + charset.length;
> >         size_t i = 0;
> >
> >         negated = !negated;
> >         temp.length = MAX_CHARS - charset.length;
> >         ptr = temp.ptr;
> >
> >         while (curr < end)
> >         {
> >             while (*curr > curr_char)
> >             {
> >                 *ptr = curr_char;
> >                 ++ptr;
> >                 ++curr_char;
> >                 ++i;
> >             }
> >
> >             ++curr_char;
> >             ++curr;
> >             ++i;
> >         }
> >
> >         for (; i < MAX_CHARS; ++i)
> >         {
> >             *ptr = curr_char;
> >             ++ptr;
> >             ++curr_char;
> >         }
> >
> >         charset = temp;
> >     }
> >
> >     bool empty()
> >     {
> >         return charset.length == 0 && !negated;
> >     }
> >
> >     bool any()
> >     {
> >         return charset.length == 0 && negated;
> >     }
> >
> >     void clear()
> >     {
> >         negated = false;
> >         charset.length = 0;
> >     }
> >
> >     void intersect(ref BasicStringToken rhs,
> >         ref BasicStringToken overlap)
> >     {
> >         if ((any() && rhs.any()) || (negated == rhs.negated &&
> >             !any() && !rhs.any()))
> >         {
> >             intersectSameTypes(rhs, overlap);
> >         }
> >         else
> >         {
> >             intersectDiffTypes(rhs, overlap);
> >         }
> >     }
> >
> > private:
> >     void intersectSameTypes(ref BasicStringToken rhs,
> >         ref BasicStringToken overlap)
> >     {
> >         if (any())
> >         {
> >             clear();
> >             overlap.negated = true;
> >             rhs.clear();
> >         }
> >         else
> >         {
> >             CharT *iter = charset.ptr;
> >             CharT *end = iter + charset.length;
> >             CharT *rhs_iter = rhs.charset.ptr;
> >             CharT *rhs_end = rhs_iter + rhs.charset.length;
> >
> >             overlap.negated = negated;
> >
> >             while (iter != end && rhs_iter != rhs_end)
> >             {
> >                 if (*iter < *rhs_iter)
> >                 {
> >                     ++iter;
> >                 }
> >                 else if (*iter > *rhs_iter)
> >                 {
> >                     ++rhs_iter;
> >                 }
> >                 else
> >                 {
> >                     overlap.charset ~= *iter;
> >                     memmove(iter, iter + 1, (charset.ptr +
> >                         charset.length - iter) * CharT.sizeof);
> >                     --end;
> >                     charset.length -= 1;
> >                     memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
> >                         rhs.charset.length - rhs_iter) * CharT.sizeof);
> >                     --rhs_end;
> >                     rhs.charset.length -= 1;
> >                 }
> >             }
> >
> >             if (negated)
> >             {
> >                 // duplicates already merged
> >                 // src, dest
> >                 merge(charset, overlap.charset);
> >                 // duplicates already merged
> >                 // src, dest
> >                 merge(rhs.charset, overlap.charset);
> >                 negated = false;
> >                 rhs.negated = false;
> >                 swap(charset, rhs.charset);
> >                 normalise();
> >                 overlap.normalise();
> >                 rhs.normalise();
> >             }
> >             else if (!overlap.charset.length == 0)
> >             {
> >                 normalise();
> >                 overlap.normalise();
> >                 rhs.normalise();
> >             }
> >         }
> >     }
> >
> >     void intersectDiffTypes(ref BasicStringToken rhs,
> >         ref BasicStringToken overlap)
> >     {
> >         if (any())
> >         {
> >             intersectAny(rhs, overlap);
> >         }
> >         else if (negated)
> >         {
> >             intersectNegated(rhs, overlap);
> >         }
> >         else // negated == false
> >         {
> >             intersectCharset(rhs, overlap);
> >         }
> >     }
> >
> >     void intersectAny(ref BasicStringToken rhs, ref BasicStringToken
> > overlap)
> >     {
> >         if (rhs.negated)
> >         {
> >             rhs.intersectNegated(this, overlap);
> >         }
> >         else // rhs.negated == false
> >         {
> >             rhs.intersectCharset(this, overlap);
> >         }
> >     }
> >
> >     void intersectNegated(ref BasicStringToken rhs,
> >         ref BasicStringToken overlap)
> >     {
> >         if (rhs.any())
> >         {
> >             overlap.negated = true;
> >             overlap.charset = charset;
> >             rhs.negated = false;
> >             rhs.charset = charset;
> >             clear();
> >         }
> >         else // rhs.negated == false
> >         {
> >             rhs.intersectCharset(this, overlap);
> >         }
> >     }
> >
> >     void intersectCharset(ref BasicStringToken rhs,
> >         ref BasicStringToken overlap)
> >     {
> >         if (rhs.any())
> >         {
> >             overlap.charset = charset;
> >             rhs.negated = true;
> >             rhs.charset = charset;
> >             clear();
> >         }
> >         else // rhs.negated == true
> >         {
> >             CharT *iter = charset.ptr;
> >             CharT *end = iter + charset.length;
> >             CharT *rhs_iter = rhs.charset.ptr;
> >             CharT *rhs_end = rhs_iter + rhs.charset.length;
> >
> >             while (iter != end && rhs_iter != rhs_end)
> >             {
> >                 if (*iter < *rhs_iter)
> >                 {
> >                     overlap.charset ~= *iter;
> >                     rhs.charset.length += 1;
> >                     rhs_iter = rhs.charset.ptr;
> >                     rhs_end = rhs_iter + rhs.charset.length;
> >                     memmove(rhs_iter + 1, rhs_iter, (rhs.charset.length -
> >                         (rhs_end - rhs_iter - 1)) * CharT.sizeof);
> >                     ++rhs_iter;
> >                     memmove(iter, iter + 1, (charset.ptr +
> >                         charset.length - iter) * CharT.sizeof);
> >                     charset.length -= 1;
> >                     --end;
> >                 }
> >                 else if (*iter > *rhs_iter)
> >                 {
> >                     ++rhs_iter;
> >                 }
> >                 else
> >                 {
> >                     ++iter;
> >                     ++rhs_iter;
> >                 }
> >             }
> >
> >             if (iter != end)
> >             {
> >                 CharT[] temp;
> >
> >                 temp.length = end - iter;
> >                 memmove(temp.ptr, iter, temp.length * CharT.sizeof);
> >
> >                 // nothing bigger in rhs than iter
> >                 // src, dest
> >                 merge(temp, overlap.charset);
> >                 memmove(iter, iter + 1, (charset.ptr +
> >                     charset.length - iter) * CharT.sizeof);
> >                 charset.length -= 1;
> >             }
> >
> >             if (!overlap.charset.empty())
> >             {
> >                 merge(overlap.charset, rhs.charset);
> >                 // possible duplicates, so check for any and erase.
> >                 squeeze(rhs.charset);
> >                 normalise();
> >                 overlap.normalise();
> >                 rhs.normalise();
> >             }
> >         }
> >     }
> >
> >     void squeeze(ref CharT[] str)
> >     {
> >         if (str.length > 1)
> >         {
> >             CharT *write = str.ptr;
> >             CharT *end = write + str.length;
> >             CharT *read = write + 1;
> >
> >             while (read != end)
> >             {
> >                 while (read != end && *read == *write)
> >                 {
> >                     ++read;
> >                 }
> >
> >                 if (read == end) break;
> >
> >                 ++write;
> >
> >                 if (read > write)
> >                 {
> >                     *write = *read;
> >                 }
> >
> >                 ++read;
> >             }
> >
> >             str.length = write + 1 - str.ptr;
> >         }
> >     }
> >
> >     void merge(ref CharT[] src, ref CharT[] dest)
> >     {
> >         CharT[] temp;
> >         CharT *ptr;
> >         CharT *iter = src.ptr;
> >         CharT *end = iter + src.length;
> >         CharT *dest_iter = dest.ptr;
> >         CharT *dest_end = dest_iter + dest.length;
> >
> >         temp.length = src.length + dest.length;
> >         ptr = temp.ptr;
> >
> >         while (iter != end && dest_iter != dest_end)
> >         {
> >             if (*iter < *dest_iter)
> >             {
> >                 *ptr++ = *iter++;
> >             }
> >             else
> >             {
> >                 *ptr++ = *dest_iter++;
> >             }
> >         }
> >
> >         while (iter != end)
> >         {
> >             *ptr++ = *iter++;
> >         }
> >
> >         while (dest_iter != dest_end)
> >         {
> >             *ptr++ = *dest_iter++;
> >         }
> >
> >         dest = temp;
> >     }
> > };
> > }
> >
> > int main(char[][]argv)
> > {
> >     regex!(char).BasicStringToken lhs;
> >     regex!(char).BasicStringToken rhs;
> >     regex!(char).BasicStringToken intersect;
> >
> >     lhs.charset = "aaabbc".dup;
> >     lhs.negated = true;
> >     lhs.removeDuplicates();
> >     rhs.charset = "bccddd".dup;
> >     rhs.negated = true;
> >     rhs.removeDuplicates();
> >     writeln(lhs.charset, '(', lhs.negated, ") intersect ",
> >         rhs.charset, '(', rhs.negated, ") =");
> >     lhs.intersect(rhs, intersect);
> >     writeln(lhs.charset, '(', lhs.negated, "), ",
> >         rhs.charset, '(', rhs.negated, "), ",
> >         intersect.charset, '(', intersect.negated, ')');
> >     return 0;
> > }

> "the string"w
> gives you 16-bit, I believe. Postfix with a 'd' should give you 32-bit.

Thanks. The problem now is that sort() corrupts the strings. Does anyone know why?

Regards,

Ben
June 22, 2010
On Tue, 22 Jun 2010 15:31:14 +0200, Ben Hanson <Ben.Hanson@tfbplc.co.uk> wrote:

> == Quote from Rory McGuire (rmcguire@neonova.co.za)'s article
>> On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson <Ben.Hanson@tfbplc.co.uk>
>> wrote:
>> > Here's the latest with naming convention (hopefully) followed. I've
>> > implemented my
>> > own squeeze() function and used sizeof in the memmove calls.
>> >
>> > How can I specify wide strings for the literals?
>> >
>> > Thanks,
>> >
>> > Ben
>> >
>> > module main;
>> >
>> > import std.algorithm;
>> > import std.array;
>> > import std.c.string;
>> > import std.string;
>> >
>> > import std.stdio;
>> >
>> > template regex(CharT)
>> > {
>> > struct BasicStringToken
>> > {
>> >     bool negated = false;
>> >     CharT[] charset;
>> >     enum size_t MAX_CHARS = CharT.max + 1;
>> >     enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
>> >
>> >     this(const bool negated_, ref CharT[] charset_)
>> >     {
>> >         negated = negated_;
>> >         charset = charset_;
>> >     }
>> >
>> >     void removeDuplicates()
>> >     {
>> >         charset.sort;
>> >         squeeze(charset);
>> >     }
>> >
>> >     void normalise()
>> >     {
>> >         if (charset.length == MAX_CHARS)
>> >         {
>> >             negated = !negated;
>> >             charset.clear();
>> >         }
>> >         else if (charset.length > MAX_CHARS / 2)
>> >         {
>> >             negate();
>> >         }
>> >     }
>> >
>> >     void negate()
>> >     {
>> >         CharT curr_char = START_CHAR;
>> >         CharT[] temp;
>> >         CharT *ptr = cast(CharT *) 0;
>> >         CharT *curr = charset.ptr;
>> >         CharT *end = curr + charset.length;
>> >         size_t i = 0;
>> >
>> >         negated = !negated;
>> >         temp.length = MAX_CHARS - charset.length;
>> >         ptr = temp.ptr;
>> >
>> >         while (curr < end)
>> >         {
>> >             while (*curr > curr_char)
>> >             {
>> >                 *ptr = curr_char;
>> >                 ++ptr;
>> >                 ++curr_char;
>> >                 ++i;
>> >             }
>> >
>> >             ++curr_char;
>> >             ++curr;
>> >             ++i;
>> >         }
>> >
>> >         for (; i < MAX_CHARS; ++i)
>> >         {
>> >             *ptr = curr_char;
>> >             ++ptr;
>> >             ++curr_char;
>> >         }
>> >
>> >         charset = temp;
>> >     }
>> >
>> >     bool empty()
>> >     {
>> >         return charset.length == 0 && !negated;
>> >     }
>> >
>> >     bool any()
>> >     {
>> >         return charset.length == 0 && negated;
>> >     }
>> >
>> >     void clear()
>> >     {
>> >         negated = false;
>> >         charset.length = 0;
>> >     }
>> >
>> >     void intersect(ref BasicStringToken rhs,
>> >         ref BasicStringToken overlap)
>> >     {
>> >         if ((any() && rhs.any()) || (negated == rhs.negated &&
>> >             !any() && !rhs.any()))
>> >         {
>> >             intersectSameTypes(rhs, overlap);
>> >         }
>> >         else
>> >         {
>> >             intersectDiffTypes(rhs, overlap);
>> >         }
>> >     }
>> >
>> > private:
>> >     void intersectSameTypes(ref BasicStringToken rhs,
>> >         ref BasicStringToken overlap)
>> >     {
>> >         if (any())
>> >         {
>> >             clear();
>> >             overlap.negated = true;
>> >             rhs.clear();
>> >         }
>> >         else
>> >         {
>> >             CharT *iter = charset.ptr;
>> >             CharT *end = iter + charset.length;
>> >             CharT *rhs_iter = rhs.charset.ptr;
>> >             CharT *rhs_end = rhs_iter + rhs.charset.length;
>> >
>> >             overlap.negated = negated;
>> >
>> >             while (iter != end && rhs_iter != rhs_end)
>> >             {
>> >                 if (*iter < *rhs_iter)
>> >                 {
>> >                     ++iter;
>> >                 }
>> >                 else if (*iter > *rhs_iter)
>> >                 {
>> >                     ++rhs_iter;
>> >                 }
>> >                 else
>> >                 {
>> >                     overlap.charset ~= *iter;
>> >                     memmove(iter, iter + 1, (charset.ptr +
>> >                         charset.length - iter) * CharT.sizeof);
>> >                     --end;
>> >                     charset.length -= 1;
>> >                     memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
>> >                         rhs.charset.length - rhs_iter) *  
>> CharT.sizeof);
>> >                     --rhs_end;
>> >                     rhs.charset.length -= 1;
>> >                 }
>> >             }
>> >
>> >             if (negated)
>> >             {
>> >                 // duplicates already merged
>> >                 // src, dest
>> >                 merge(charset, overlap.charset);
>> >                 // duplicates already merged
>> >                 // src, dest
>> >                 merge(rhs.charset, overlap.charset);
>> >                 negated = false;
>> >                 rhs.negated = false;
>> >                 swap(charset, rhs.charset);
>> >                 normalise();
>> >                 overlap.normalise();
>> >                 rhs.normalise();
>> >             }
>> >             else if (!overlap.charset.length == 0)
>> >             {
>> >                 normalise();
>> >                 overlap.normalise();
>> >                 rhs.normalise();
>> >             }
>> >         }
>> >     }
>> >
>> >     void intersectDiffTypes(ref BasicStringToken rhs,
>> >         ref BasicStringToken overlap)
>> >     {
>> >         if (any())
>> >         {
>> >             intersectAny(rhs, overlap);
>> >         }
>> >         else if (negated)
>> >         {
>> >             intersectNegated(rhs, overlap);
>> >         }
>> >         else // negated == false
>> >         {
>> >             intersectCharset(rhs, overlap);
>> >         }
>> >     }
>> >
>> >     void intersectAny(ref BasicStringToken rhs, ref BasicStringToken
>> > overlap)
>> >     {
>> >         if (rhs.negated)
>> >         {
>> >             rhs.intersectNegated(this, overlap);
>> >         }
>> >         else // rhs.negated == false
>> >         {
>> >             rhs.intersectCharset(this, overlap);
>> >         }
>> >     }
>> >
>> >     void intersectNegated(ref BasicStringToken rhs,
>> >         ref BasicStringToken overlap)
>> >     {
>> >         if (rhs.any())
>> >         {
>> >             overlap.negated = true;
>> >             overlap.charset = charset;
>> >             rhs.negated = false;
>> >             rhs.charset = charset;
>> >             clear();
>> >         }
>> >         else // rhs.negated == false
>> >         {
>> >             rhs.intersectCharset(this, overlap);
>> >         }
>> >     }
>> >
>> >     void intersectCharset(ref BasicStringToken rhs,
>> >         ref BasicStringToken overlap)
>> >     {
>> >         if (rhs.any())
>> >         {
>> >             overlap.charset = charset;
>> >             rhs.negated = true;
>> >             rhs.charset = charset;
>> >             clear();
>> >         }
>> >         else // rhs.negated == true
>> >         {
>> >             CharT *iter = charset.ptr;
>> >             CharT *end = iter + charset.length;
>> >             CharT *rhs_iter = rhs.charset.ptr;
>> >             CharT *rhs_end = rhs_iter + rhs.charset.length;
>> >
>> >             while (iter != end && rhs_iter != rhs_end)
>> >             {
>> >                 if (*iter < *rhs_iter)
>> >                 {
>> >                     overlap.charset ~= *iter;
>> >                     rhs.charset.length += 1;
>> >                     rhs_iter = rhs.charset.ptr;
>> >                     rhs_end = rhs_iter + rhs.charset.length;
>> >                     memmove(rhs_iter + 1, rhs_iter,  
>> (rhs.charset.length -
>> >                         (rhs_end - rhs_iter - 1)) * CharT.sizeof);
>> >                     ++rhs_iter;
>> >                     memmove(iter, iter + 1, (charset.ptr +
>> >                         charset.length - iter) * CharT.sizeof);
>> >                     charset.length -= 1;
>> >                     --end;
>> >                 }
>> >                 else if (*iter > *rhs_iter)
>> >                 {
>> >                     ++rhs_iter;
>> >                 }
>> >                 else
>> >                 {
>> >                     ++iter;
>> >                     ++rhs_iter;
>> >                 }
>> >             }
>> >
>> >             if (iter != end)
>> >             {
>> >                 CharT[] temp;
>> >
>> >                 temp.length = end - iter;
>> >                 memmove(temp.ptr, iter, temp.length * CharT.sizeof);
>> >
>> >                 // nothing bigger in rhs than iter
>> >                 // src, dest
>> >                 merge(temp, overlap.charset);
>> >                 memmove(iter, iter + 1, (charset.ptr +
>> >                     charset.length - iter) * CharT.sizeof);
>> >                 charset.length -= 1;
>> >             }
>> >
>> >             if (!overlap.charset.empty())
>> >             {
>> >                 merge(overlap.charset, rhs.charset);
>> >                 // possible duplicates, so check for any and erase.
>> >                 squeeze(rhs.charset);
>> >                 normalise();
>> >                 overlap.normalise();
>> >                 rhs.normalise();
>> >             }
>> >         }
>> >     }
>> >
>> >     void squeeze(ref CharT[] str)
>> >     {
>> >         if (str.length > 1)
>> >         {
>> >             CharT *write = str.ptr;
>> >             CharT *end = write + str.length;
>> >             CharT *read = write + 1;
>> >
>> >             while (read != end)
>> >             {
>> >                 while (read != end && *read == *write)
>> >                 {
>> >                     ++read;
>> >                 }
>> >
>> >                 if (read == end) break;
>> >
>> >                 ++write;
>> >
>> >                 if (read > write)
>> >                 {
>> >                     *write = *read;
>> >                 }
>> >
>> >                 ++read;
>> >             }
>> >
>> >             str.length = write + 1 - str.ptr;
>> >         }
>> >     }
>> >
>> >     void merge(ref CharT[] src, ref CharT[] dest)
>> >     {
>> >         CharT[] temp;
>> >         CharT *ptr;
>> >         CharT *iter = src.ptr;
>> >         CharT *end = iter + src.length;
>> >         CharT *dest_iter = dest.ptr;
>> >         CharT *dest_end = dest_iter + dest.length;
>> >
>> >         temp.length = src.length + dest.length;
>> >         ptr = temp.ptr;
>> >
>> >         while (iter != end && dest_iter != dest_end)
>> >         {
>> >             if (*iter < *dest_iter)
>> >             {
>> >                 *ptr++ = *iter++;
>> >             }
>> >             else
>> >             {
>> >                 *ptr++ = *dest_iter++;
>> >             }
>> >         }
>> >
>> >         while (iter != end)
>> >         {
>> >             *ptr++ = *iter++;
>> >         }
>> >
>> >         while (dest_iter != dest_end)
>> >         {
>> >             *ptr++ = *dest_iter++;
>> >         }
>> >
>> >         dest = temp;
>> >     }
>> > };
>> > }
>> >
>> > int main(char[][]argv)
>> > {
>> >     regex!(char).BasicStringToken lhs;
>> >     regex!(char).BasicStringToken rhs;
>> >     regex!(char).BasicStringToken intersect;
>> >
>> >     lhs.charset = "aaabbc".dup;
>> >     lhs.negated = true;
>> >     lhs.removeDuplicates();
>> >     rhs.charset = "bccddd".dup;
>> >     rhs.negated = true;
>> >     rhs.removeDuplicates();
>> >     writeln(lhs.charset, '(', lhs.negated, ") intersect ",
>> >         rhs.charset, '(', rhs.negated, ") =");
>> >     lhs.intersect(rhs, intersect);
>> >     writeln(lhs.charset, '(', lhs.negated, "), ",
>> >         rhs.charset, '(', rhs.negated, "), ",
>> >         intersect.charset, '(', intersect.negated, ')');
>> >     return 0;
>> > }
>
>> "the string"w
>> gives you 16-bit, I believe. Postfix with a 'd' should give you 32-bit.
>
> Thanks. The problem now is that sort() corrupts the strings. Does anyone know why?
>
> Regards,
>
> Ben

Perhaps from mixing wide chars with CharT if CharT is 8 bits?
Honestly, I haven't read your code, but that is just the likely scenario.
June 22, 2010
== Quote from Rory McGuire (rmcguire@neonova.co.za)'s article
> On Tue, 22 Jun 2010 15:31:14 +0200, Ben Hanson <Ben.Hanson@tfbplc.co.uk> wrote:
> > == Quote from Rory McGuire (rmcguire@neonova.co.za)'s article
> >> On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson <Ben.Hanson@tfbplc.co.uk> wrote:
> >> > Here's the latest with naming convention (hopefully) followed. I've
> >> > implemented my
> >> > own squeeze() function and used sizeof in the memmove calls.
> >> >
> >> > How can I specify wide strings for the literals?
> >> >
> >> > Thanks,
> >> >
> >> > Ben
> >> >
> >> > module main;
> >> >
> >> > import std.algorithm;
> >> > import std.array;
> >> > import std.c.string;
> >> > import std.string;
> >> >
> >> > import std.stdio;
> >> >
> >> > template regex(CharT)
> >> > {
> >> > struct BasicStringToken
> >> > {
> >> >     bool negated = false;
> >> >     CharT[] charset;
> >> >     enum size_t MAX_CHARS = CharT.max + 1;
> >> >     enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
> >> >
> >> >     this(const bool negated_, ref CharT[] charset_)
> >> >     {
> >> >         negated = negated_;
> >> >         charset = charset_;
> >> >     }
> >> >
> >> >     void removeDuplicates()
> >> >     {
> >> >         charset.sort;
> >> >         squeeze(charset);
> >> >     }
> >> >
> >> >     void normalise()
> >> >     {
> >> >         if (charset.length == MAX_CHARS)
> >> >         {
> >> >             negated = !negated;
> >> >             charset.clear();
> >> >         }
> >> >         else if (charset.length > MAX_CHARS / 2)
> >> >         {
> >> >             negate();
> >> >         }
> >> >     }
> >> >
> >> >     void negate()
> >> >     {
> >> >         CharT curr_char = START_CHAR;
> >> >         CharT[] temp;
> >> >         CharT *ptr = cast(CharT *) 0;
> >> >         CharT *curr = charset.ptr;
> >> >         CharT *end = curr + charset.length;
> >> >         size_t i = 0;
> >> >
> >> >         negated = !negated;
> >> >         temp.length = MAX_CHARS - charset.length;
> >> >         ptr = temp.ptr;
> >> >
> >> >         while (curr < end)
> >> >         {
> >> >             while (*curr > curr_char)
> >> >             {
> >> >                 *ptr = curr_char;
> >> >                 ++ptr;
> >> >                 ++curr_char;
> >> >                 ++i;
> >> >             }
> >> >
> >> >             ++curr_char;
> >> >             ++curr;
> >> >             ++i;
> >> >         }
> >> >
> >> >         for (; i < MAX_CHARS; ++i)
> >> >         {
> >> >             *ptr = curr_char;
> >> >             ++ptr;
> >> >             ++curr_char;
> >> >         }
> >> >
> >> >         charset = temp;
> >> >     }
> >> >
> >> >     bool empty()
> >> >     {
> >> >         return charset.length == 0 && !negated;
> >> >     }
> >> >
> >> >     bool any()
> >> >     {
> >> >         return charset.length == 0 && negated;
> >> >     }
> >> >
> >> >     void clear()
> >> >     {
> >> >         negated = false;
> >> >         charset.length = 0;
> >> >     }
> >> >
> >> >     void intersect(ref BasicStringToken rhs,
> >> >         ref BasicStringToken overlap)
> >> >     {
> >> >         if ((any() && rhs.any()) || (negated == rhs.negated &&
> >> >             !any() && !rhs.any()))
> >> >         {
> >> >             intersectSameTypes(rhs, overlap);
> >> >         }
> >> >         else
> >> >         {
> >> >             intersectDiffTypes(rhs, overlap);
> >> >         }
> >> >     }
> >> >
> >> > private:
> >> >     void intersectSameTypes(ref BasicStringToken rhs,
> >> >         ref BasicStringToken overlap)
> >> >     {
> >> >         if (any())
> >> >         {
> >> >             clear();
> >> >             overlap.negated = true;
> >> >             rhs.clear();
> >> >         }
> >> >         else
> >> >         {
> >> >             CharT *iter = charset.ptr;
> >> >             CharT *end = iter + charset.length;
> >> >             CharT *rhs_iter = rhs.charset.ptr;
> >> >             CharT *rhs_end = rhs_iter + rhs.charset.length;
> >> >
> >> >             overlap.negated = negated;
> >> >
> >> >             while (iter != end && rhs_iter != rhs_end)
> >> >             {
> >> >                 if (*iter < *rhs_iter)
> >> >                 {
> >> >                     ++iter;
> >> >                 }
> >> >                 else if (*iter > *rhs_iter)
> >> >                 {
> >> >                     ++rhs_iter;
> >> >                 }
> >> >                 else
> >> >                 {
> >> >                     overlap.charset ~= *iter;
> >> >                     memmove(iter, iter + 1, (charset.ptr +
> >> >                         charset.length - iter) * CharT.sizeof);
> >> >                     --end;
> >> >                     charset.length -= 1;
> >> >                     memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
> >> >                         rhs.charset.length - rhs_iter) *
> >> CharT.sizeof);
> >> >                     --rhs_end;
> >> >                     rhs.charset.length -= 1;
> >> >                 }
> >> >             }
> >> >
> >> >             if (negated)
> >> >             {
> >> >                 // duplicates already merged
> >> >                 // src, dest
> >> >                 merge(charset, overlap.charset);
> >> >                 // duplicates already merged
> >> >                 // src, dest
> >> >                 merge(rhs.charset, overlap.charset);
> >> >                 negated = false;
> >> >                 rhs.negated = false;
> >> >                 swap(charset, rhs.charset);
> >> >                 normalise();
> >> >                 overlap.normalise();
> >> >                 rhs.normalise();
> >> >             }
> >> >             else if (!overlap.charset.length == 0)
> >> >             {
> >> >                 normalise();
> >> >                 overlap.normalise();
> >> >                 rhs.normalise();
> >> >             }
> >> >         }
> >> >     }
> >> >
> >> >     void intersectDiffTypes(ref BasicStringToken rhs,
> >> >         ref BasicStringToken overlap)
> >> >     {
> >> >         if (any())
> >> >         {
> >> >             intersectAny(rhs, overlap);
> >> >         }
> >> >         else if (negated)
> >> >         {
> >> >             intersectNegated(rhs, overlap);
> >> >         }
> >> >         else // negated == false
> >> >         {
> >> >             intersectCharset(rhs, overlap);
> >> >         }
> >> >     }
> >> >
> >> >     void intersectAny(ref BasicStringToken rhs, ref BasicStringToken
> >> > overlap)
> >> >     {
> >> >         if (rhs.negated)
> >> >         {
> >> >             rhs.intersectNegated(this, overlap);
> >> >         }
> >> >         else // rhs.negated == false
> >> >         {
> >> >             rhs.intersectCharset(this, overlap);
> >> >         }
> >> >     }
> >> >
> >> >     void intersectNegated(ref BasicStringToken rhs,
> >> >         ref BasicStringToken overlap)
> >> >     {
> >> >         if (rhs.any())
> >> >         {
> >> >             overlap.negated = true;
> >> >             overlap.charset = charset;
> >> >             rhs.negated = false;
> >> >             rhs.charset = charset;
> >> >             clear();
> >> >         }
> >> >         else // rhs.negated == false
> >> >         {
> >> >             rhs.intersectCharset(this, overlap);
> >> >         }
> >> >     }
> >> >
> >> >     void intersectCharset(ref BasicStringToken rhs,
> >> >         ref BasicStringToken overlap)
> >> >     {
> >> >         if (rhs.any())
> >> >         {
> >> >             overlap.charset = charset;
> >> >             rhs.negated = true;
> >> >             rhs.charset = charset;
> >> >             clear();
> >> >         }
> >> >         else // rhs.negated == true
> >> >         {
> >> >             CharT *iter = charset.ptr;
> >> >             CharT *end = iter + charset.length;
> >> >             CharT *rhs_iter = rhs.charset.ptr;
> >> >             CharT *rhs_end = rhs_iter + rhs.charset.length;
> >> >
> >> >             while (iter != end && rhs_iter != rhs_end)
> >> >             {
> >> >                 if (*iter < *rhs_iter)
> >> >                 {
> >> >                     overlap.charset ~= *iter;
> >> >                     rhs.charset.length += 1;
> >> >                     rhs_iter = rhs.charset.ptr;
> >> >                     rhs_end = rhs_iter + rhs.charset.length;
> >> >                     memmove(rhs_iter + 1, rhs_iter,
> >> (rhs.charset.length -
> >> >                         (rhs_end - rhs_iter - 1)) * CharT.sizeof);
> >> >                     ++rhs_iter;
> >> >                     memmove(iter, iter + 1, (charset.ptr +
> >> >                         charset.length - iter) * CharT.sizeof);
> >> >                     charset.length -= 1;
> >> >                     --end;
> >> >                 }
> >> >                 else if (*iter > *rhs_iter)
> >> >                 {
> >> >                     ++rhs_iter;
> >> >                 }
> >> >                 else
> >> >                 {
> >> >                     ++iter;
> >> >                     ++rhs_iter;
> >> >                 }
> >> >             }
> >> >
> >> >             if (iter != end)
> >> >             {
> >> >                 CharT[] temp;
> >> >
> >> >                 temp.length = end - iter;
> >> >                 memmove(temp.ptr, iter, temp.length * CharT.sizeof);
> >> >
> >> >                 // nothing bigger in rhs than iter
> >> >                 // src, dest
> >> >                 merge(temp, overlap.charset);
> >> >                 memmove(iter, iter + 1, (charset.ptr +
> >> >                     charset.length - iter) * CharT.sizeof);
> >> >                 charset.length -= 1;
> >> >             }
> >> >
> >> >             if (!overlap.charset.empty())
> >> >             {
> >> >                 merge(overlap.charset, rhs.charset);
> >> >                 // possible duplicates, so check for any and erase.
> >> >                 squeeze(rhs.charset);
> >> >                 normalise();
> >> >                 overlap.normalise();
> >> >                 rhs.normalise();
> >> >             }
> >> >         }
> >> >     }
> >> >
> >> >     void squeeze(ref CharT[] str)
> >> >     {
> >> >         if (str.length > 1)
> >> >         {
> >> >             CharT *write = str.ptr;
> >> >             CharT *end = write + str.length;
> >> >             CharT *read = write + 1;
> >> >
> >> >             while (read != end)
> >> >             {
> >> >                 while (read != end && *read == *write)
> >> >                 {
> >> >                     ++read;
> >> >                 }
> >> >
> >> >                 if (read == end) break;
> >> >
> >> >                 ++write;
> >> >
> >> >                 if (read > write)
> >> >                 {
> >> >                     *write = *read;
> >> >                 }
> >> >
> >> >                 ++read;
> >> >             }
> >> >
> >> >             str.length = write + 1 - str.ptr;
> >> >         }
> >> >     }
> >> >
> >> >     void merge(ref CharT[] src, ref CharT[] dest)
> >> >     {
> >> >         CharT[] temp;
> >> >         CharT *ptr;
> >> >         CharT *iter = src.ptr;
> >> >         CharT *end = iter + src.length;
> >> >         CharT *dest_iter = dest.ptr;
> >> >         CharT *dest_end = dest_iter + dest.length;
> >> >
> >> >         temp.length = src.length + dest.length;
> >> >         ptr = temp.ptr;
> >> >
> >> >         while (iter != end && dest_iter != dest_end)
> >> >         {
> >> >             if (*iter < *dest_iter)
> >> >             {
> >> >                 *ptr++ = *iter++;
> >> >             }
> >> >             else
> >> >             {
> >> >                 *ptr++ = *dest_iter++;
> >> >             }
> >> >         }
> >> >
> >> >         while (iter != end)
> >> >         {
> >> >             *ptr++ = *iter++;
> >> >         }
> >> >
> >> >         while (dest_iter != dest_end)
> >> >         {
> >> >             *ptr++ = *dest_iter++;
> >> >         }
> >> >
> >> >         dest = temp;
> >> >     }
> >> > };
> >> > }
> >> >
> >> > int main(char[][]argv)
> >> > {
> >> >     regex!(char).BasicStringToken lhs;
> >> >     regex!(char).BasicStringToken rhs;
> >> >     regex!(char).BasicStringToken intersect;
> >> >
> >> >     lhs.charset = "aaabbc".dup;
> >> >     lhs.negated = true;
> >> >     lhs.removeDuplicates();
> >> >     rhs.charset = "bccddd".dup;
> >> >     rhs.negated = true;
> >> >     rhs.removeDuplicates();
> >> >     writeln(lhs.charset, '(', lhs.negated, ") intersect ",
> >> >         rhs.charset, '(', rhs.negated, ") =");
> >> >     lhs.intersect(rhs, intersect);
> >> >     writeln(lhs.charset, '(', lhs.negated, "), ",
> >> >         rhs.charset, '(', rhs.negated, "), ",
> >> >         intersect.charset, '(', intersect.negated, ')');
> >> >     return 0;
> >> > }
> >
> >> "the string"w
> >> gives you 16bit I believe. postfix with a 'd' should give you 32bit.
> >
> > Thanks. The problem now is that sort() corrupts the strings. Does anyone
> > know why?
> >
> > Regards,
> >
> > Ben
> perhaps from mixing wide chars with CharT if CharT is 8bits?
> Honestly I haven't read your code but that is just the likely scenario.

I don't think so:

// Test driver for the wchar instantiation of the quoted regex template.
// Both the template argument (wchar) and the literals ("..."w) are wide, so
// the element type of charset matches the literal type being assigned to it.
int main(char[][]argv)
{
    // Three tokens: the two operands and the token that receives the overlap.
    regex!(wchar).BasicStringToken lhs;
    regex!(wchar).BasicStringToken rhs;
    regex!(wchar).BasicStringToken intersect;

    // .dup yields a mutable copy of the literal; the token's member functions
    // modify charset in place.
    lhs.charset = "aaabbc"w.dup;
    lhs.negated = true;
    // removeDuplicates() (see the quoted template) sorts charset and then
    // squeezes out repeated characters; this is the step that invokes sort.
    lhs.removeDuplicates();
    rhs.charset = "bccddd"w.dup;
    rhs.negated = true;
    rhs.removeDuplicates();
    // Print both operands (charset and negated flag) before intersecting.
    writeln(lhs.charset, '(', lhs.negated, ") intersect ",
        rhs.charset, '(', rhs.negated, ") =");
    // intersect() takes rhs and the overlap token by ref, so lhs, rhs and
    // intersect may all be modified by this call.
    lhs.intersect(rhs, intersect);
    // Print all three tokens after the call.
    writeln(lhs.charset, '(', lhs.negated, "), ",
        rhs.charset, '(', rhs.negated, "), ",
        intersect.charset, '(', intersect.negated, ')');
    return 0;
}
June 22, 2010
On Tue, 22 Jun 2010 16:37:38 +0200, Ben Hanson <Ben.Hanson@tfbplc.co.uk> wrote:

> == Quote from Rory McGuire (rmcguire@neonova.co.za)'s article
>> On Tue, 22 Jun 2010 15:31:14 +0200, Ben Hanson <Ben.Hanson@tfbplc.co.uk>
>> wrote:
>> > == Quote from Rory McGuire (rmcguire@neonova.co.za)'s article
>> >> On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson  
>> <Ben.Hanson@tfbplc.co.uk>
>> >> wrote:
>> >> > Here's the latest with naming convention (hopefully) followed. I've
>> >> > implemented my
>> >> > own squeeze() function and used sizeof in the memmove calls.
>> >> >
>> >> > How can I specify wide strings for the literals?
>> >> >
>> >> > Thanks,
>> >> >
>> >> > Ben
>> >> >
>> >> > module main;
>> >> >
>> >> > import std.algorithm;
>> >> > import std.array;
>> >> > import std.c.string;
>> >> > import std.string;
>> >> >
>> >> > import std.stdio;
>> >> >
>> >> > template regex(CharT)
>> >> > {
>> >> > struct BasicStringToken
>> >> > {
>> >> >     bool negated = false;
>> >> >     CharT[] charset;
>> >> >     enum size_t MAX_CHARS = CharT.max + 1;
>> >> >     enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
>> >> >
>> >> >     this(const bool negated_, ref CharT[] charset_)
>> >> >     {
>> >> >         negated = negated_;
>> >> >         charset = charset_;
>> >> >     }
>> >> >
>> >> >     void removeDuplicates()
>> >> >     {
>> >> >         charset.sort;
>> >> >         squeeze(charset);
>> >> >     }
>> >> >
>> >> >     void normalise()
>> >> >     {
>> >> >         if (charset.length == MAX_CHARS)
>> >> >         {
>> >> >             negated = !negated;
>> >> >             charset.clear();
>> >> >         }
>> >> >         else if (charset.length > MAX_CHARS / 2)
>> >> >         {
>> >> >             negate();
>> >> >         }
>> >> >     }
>> >> >
>> >> >     void negate()
>> >> >     {
>> >> >         CharT curr_char = START_CHAR;
>> >> >         CharT[] temp;
>> >> >         CharT *ptr = cast(CharT *) 0;
>> >> >         CharT *curr = charset.ptr;
>> >> >         CharT *end = curr + charset.length;
>> >> >         size_t i = 0;
>> >> >
>> >> >         negated = !negated;
>> >> >         temp.length = MAX_CHARS - charset.length;
>> >> >         ptr = temp.ptr;
>> >> >
>> >> >         while (curr < end)
>> >> >         {
>> >> >             while (*curr > curr_char)
>> >> >             {
>> >> >                 *ptr = curr_char;
>> >> >                 ++ptr;
>> >> >                 ++curr_char;
>> >> >                 ++i;
>> >> >             }
>> >> >
>> >> >             ++curr_char;
>> >> >             ++curr;
>> >> >             ++i;
>> >> >         }
>> >> >
>> >> >         for (; i < MAX_CHARS; ++i)
>> >> >         {
>> >> >             *ptr = curr_char;
>> >> >             ++ptr;
>> >> >             ++curr_char;
>> >> >         }
>> >> >
>> >> >         charset = temp;
>> >> >     }
>> >> >
>> >> >     bool empty()
>> >> >     {
>> >> >         return charset.length == 0 && !negated;
>> >> >     }
>> >> >
>> >> >     bool any()
>> >> >     {
>> >> >         return charset.length == 0 && negated;
>> >> >     }
>> >> >
>> >> >     void clear()
>> >> >     {
>> >> >         negated = false;
>> >> >         charset.length = 0;
>> >> >     }
>> >> >
>> >> >     void intersect(ref BasicStringToken rhs,
>> >> >         ref BasicStringToken overlap)
>> >> >     {
>> >> >         if ((any() && rhs.any()) || (negated == rhs.negated &&
>> >> >             !any() && !rhs.any()))
>> >> >         {
>> >> >             intersectSameTypes(rhs, overlap);
>> >> >         }
>> >> >         else
>> >> >         {
>> >> >             intersectDiffTypes(rhs, overlap);
>> >> >         }
>> >> >     }
>> >> >
>> >> > private:
>> >> >     void intersectSameTypes(ref BasicStringToken rhs,
>> >> >         ref BasicStringToken overlap)
>> >> >     {
>> >> >         if (any())
>> >> >         {
>> >> >             clear();
>> >> >             overlap.negated = true;
>> >> >             rhs.clear();
>> >> >         }
>> >> >         else
>> >> >         {
>> >> >             CharT *iter = charset.ptr;
>> >> >             CharT *end = iter + charset.length;
>> >> >             CharT *rhs_iter = rhs.charset.ptr;
>> >> >             CharT *rhs_end = rhs_iter + rhs.charset.length;
>> >> >
>> >> >             overlap.negated = negated;
>> >> >
>> >> >             while (iter != end && rhs_iter != rhs_end)
>> >> >             {
>> >> >                 if (*iter < *rhs_iter)
>> >> >                 {
>> >> >                     ++iter;
>> >> >                 }
>> >> >                 else if (*iter > *rhs_iter)
>> >> >                 {
>> >> >                     ++rhs_iter;
>> >> >                 }
>> >> >                 else
>> >> >                 {
>> >> >                     overlap.charset ~= *iter;
>> >> >                     memmove(iter, iter + 1, (charset.ptr +
>> >> >                         charset.length - iter) * CharT.sizeof);
>> >> >                     --end;
>> >> >                     charset.length -= 1;
>> >> >                     memmove(rhs_iter, rhs_iter + 1,  
>> (rhs.charset.ptr +
>> >> >                         rhs.charset.length - rhs_iter) *
>> >> CharT.sizeof);
>> >> >                     --rhs_end;
>> >> >                     rhs.charset.length -= 1;
>> >> >                 }
>> >> >             }
>> >> >
>> >> >             if (negated)
>> >> >             {
>> >> >                 // duplicates already merged
>> >> >                 // src, dest
>> >> >                 merge(charset, overlap.charset);
>> >> >                 // duplicates already merged
>> >> >                 // src, dest
>> >> >                 merge(rhs.charset, overlap.charset);
>> >> >                 negated = false;
>> >> >                 rhs.negated = false;
>> >> >                 swap(charset, rhs.charset);
>> >> >                 normalise();
>> >> >                 overlap.normalise();
>> >> >                 rhs.normalise();
>> >> >             }
>> >> >             else if (!overlap.charset.length == 0)
>> >> >             {
>> >> >                 normalise();
>> >> >                 overlap.normalise();
>> >> >                 rhs.normalise();
>> >> >             }
>> >> >         }
>> >> >     }
>> >> >
>> >> >     void intersectDiffTypes(ref BasicStringToken rhs,
>> >> >         ref BasicStringToken overlap)
>> >> >     {
>> >> >         if (any())
>> >> >         {
>> >> >             intersectAny(rhs, overlap);
>> >> >         }
>> >> >         else if (negated)
>> >> >         {
>> >> >             intersectNegated(rhs, overlap);
>> >> >         }
>> >> >         else // negated == false
>> >> >         {
>> >> >             intersectCharset(rhs, overlap);
>> >> >         }
>> >> >     }
>> >> >
>> >> >     void intersectAny(ref BasicStringToken rhs, ref  
>> BasicStringToken
>> >> > overlap)
>> >> >     {
>> >> >         if (rhs.negated)
>> >> >         {
>> >> >             rhs.intersectNegated(this, overlap);
>> >> >         }
>> >> >         else // rhs.negated == false
>> >> >         {
>> >> >             rhs.intersectCharset(this, overlap);
>> >> >         }
>> >> >     }
>> >> >
>> >> >     void intersectNegated(ref BasicStringToken rhs,
>> >> >         ref BasicStringToken overlap)
>> >> >     {
>> >> >         if (rhs.any())
>> >> >         {
>> >> >             overlap.negated = true;
>> >> >             overlap.charset = charset;
>> >> >             rhs.negated = false;
>> >> >             rhs.charset = charset;
>> >> >             clear();
>> >> >         }
>> >> >         else // rhs.negated == false
>> >> >         {
>> >> >             rhs.intersectCharset(this, overlap);
>> >> >         }
>> >> >     }
>> >> >
>> >> >     void intersectCharset(ref BasicStringToken rhs,
>> >> >         ref BasicStringToken overlap)
>> >> >     {
>> >> >         if (rhs.any())
>> >> >         {
>> >> >             overlap.charset = charset;
>> >> >             rhs.negated = true;
>> >> >             rhs.charset = charset;
>> >> >             clear();
>> >> >         }
>> >> >         else // rhs.negated == true
>> >> >         {
>> >> >             CharT *iter = charset.ptr;
>> >> >             CharT *end = iter + charset.length;
>> >> >             CharT *rhs_iter = rhs.charset.ptr;
>> >> >             CharT *rhs_end = rhs_iter + rhs.charset.length;
>> >> >
>> >> >             while (iter != end && rhs_iter != rhs_end)
>> >> >             {
>> >> >                 if (*iter < *rhs_iter)
>> >> >                 {
>> >> >                     overlap.charset ~= *iter;
>> >> >                     rhs.charset.length += 1;
>> >> >                     rhs_iter = rhs.charset.ptr;
>> >> >                     rhs_end = rhs_iter + rhs.charset.length;
>> >> >                     memmove(rhs_iter + 1, rhs_iter,
>> >> (rhs.charset.length -
>> >> >                         (rhs_end - rhs_iter - 1)) * CharT.sizeof);
>> >> >                     ++rhs_iter;
>> >> >                     memmove(iter, iter + 1, (charset.ptr +
>> >> >                         charset.length - iter) * CharT.sizeof);
>> >> >                     charset.length -= 1;
>> >> >                     --end;
>> >> >                 }
>> >> >                 else if (*iter > *rhs_iter)
>> >> >                 {
>> >> >                     ++rhs_iter;
>> >> >                 }
>> >> >                 else
>> >> >                 {
>> >> >                     ++iter;
>> >> >                     ++rhs_iter;
>> >> >                 }
>> >> >             }
>> >> >
>> >> >             if (iter != end)
>> >> >             {
>> >> >                 CharT[] temp;
>> >> >
>> >> >                 temp.length = end - iter;
>> >> >                 memmove(temp.ptr, iter, temp.length *  
>> CharT.sizeof);
>> >> >
>> >> >                 // nothing bigger in rhs than iter
>> >> >                 // src, dest
>> >> >                 merge(temp, overlap.charset);
>> >> >                 memmove(iter, iter + 1, (charset.ptr +
>> >> >                     charset.length - iter) * CharT.sizeof);
>> >> >                 charset.length -= 1;
>> >> >             }
>> >> >
>> >> >             if (!overlap.charset.empty())
>> >> >             {
>> >> >                 merge(overlap.charset, rhs.charset);
>> >> >                 // possible duplicates, so check for any and erase.
>> >> >                 squeeze(rhs.charset);
>> >> >                 normalise();
>> >> >                 overlap.normalise();
>> >> >                 rhs.normalise();
>> >> >             }
>> >> >         }
>> >> >     }
>> >> >
>> >> >     void squeeze(ref CharT[] str)
>> >> >     {
>> >> >         if (str.length > 1)
>> >> >         {
>> >> >             CharT *write = str.ptr;
>> >> >             CharT *end = write + str.length;
>> >> >             CharT *read = write + 1;
>> >> >
>> >> >             while (read != end)
>> >> >             {
>> >> >                 while (read != end && *read == *write)
>> >> >                 {
>> >> >                     ++read;
>> >> >                 }
>> >> >
>> >> >                 if (read == end) break;
>> >> >
>> >> >                 ++write;
>> >> >
>> >> >                 if (read > write)
>> >> >                 {
>> >> >                     *write = *read;
>> >> >                 }
>> >> >
>> >> >                 ++read;
>> >> >             }
>> >> >
>> >> >             str.length = write + 1 - str.ptr;
>> >> >         }
>> >> >     }
>> >> >
>> >> >     void merge(ref CharT[] src, ref CharT[] dest)
>> >> >     {
>> >> >         CharT[] temp;
>> >> >         CharT *ptr;
>> >> >         CharT *iter = src.ptr;
>> >> >         CharT *end = iter + src.length;
>> >> >         CharT *dest_iter = dest.ptr;
>> >> >         CharT *dest_end = dest_iter + dest.length;
>> >> >
>> >> >         temp.length = src.length + dest.length;
>> >> >         ptr = temp.ptr;
>> >> >
>> >> >         while (iter != end && dest_iter != dest_end)
>> >> >         {
>> >> >             if (*iter < *dest_iter)
>> >> >             {
>> >> >                 *ptr++ = *iter++;
>> >> >             }
>> >> >             else
>> >> >             {
>> >> >                 *ptr++ = *dest_iter++;
>> >> >             }
>> >> >         }
>> >> >
>> >> >         while (iter != end)
>> >> >         {
>> >> >             *ptr++ = *iter++;
>> >> >         }
>> >> >
>> >> >         while (dest_iter != dest_end)
>> >> >         {
>> >> >             *ptr++ = *dest_iter++;
>> >> >         }
>> >> >
>> >> >         dest = temp;
>> >> >     }
>> >> > };
>> >> > }
>> >> >
>> >> > int main(char[][]argv)
>> >> > {
>> >> >     regex!(char).BasicStringToken lhs;
>> >> >     regex!(char).BasicStringToken rhs;
>> >> >     regex!(char).BasicStringToken intersect;
>> >> >
>> >> >     lhs.charset = "aaabbc".dup;
>> >> >     lhs.negated = true;
>> >> >     lhs.removeDuplicates();
>> >> >     rhs.charset = "bccddd".dup;
>> >> >     rhs.negated = true;
>> >> >     rhs.removeDuplicates();
>> >> >     writeln(lhs.charset, '(', lhs.negated, ") intersect ",
>> >> >         rhs.charset, '(', rhs.negated, ") =");
>> >> >     lhs.intersect(rhs, intersect);
>> >> >     writeln(lhs.charset, '(', lhs.negated, "), ",
>> >> >         rhs.charset, '(', rhs.negated, "), ",
>> >> >         intersect.charset, '(', intersect.negated, ')');
>> >> >     return 0;
>> >> > }
>> >
>> >> "the string"w
>> >> gives you 16bit I believe. postfix with a 'd' should give you 32bit.
>> >
>> > Thanks. The problem now is that sort() corrupts the strings. Does  
>> anyone
>> > know why?
>> >
>> > Regards,
>> >
>> > Ben
>> perhaps from mixing wide chars with CharT if CharT is 8bits?
>> Honestly I haven't read your code but that is just the likely scenario.
>
> I don't think so:
>
> int main(char[][]argv)
> {
>     regex!(wchar).BasicStringToken lhs;
>     regex!(wchar).BasicStringToken rhs;
>     regex!(wchar).BasicStringToken intersect;
>
>     lhs.charset = "aaabbc"w.dup;
>     lhs.negated = true;
>     lhs.removeDuplicates();
>     rhs.charset = "bccddd"w.dup;
>     rhs.negated = true;
>     rhs.removeDuplicates();
>     writeln(lhs.charset, '(', lhs.negated, ") intersect ",
>         rhs.charset, '(', rhs.negated, ") =");
>     lhs.intersect(rhs, intersect);
>     writeln(lhs.charset, '(', lhs.negated, "), ",
>         rhs.charset, '(', rhs.negated, "), ",
>         intersect.charset, '(', intersect.negated, ')');
>     return 0;
> }



hmm, that does seem strange, it seems to work with char and dchar but not wchar.

-Rory
June 22, 2010
On 06/22/2010 08:13 AM, Ben Hanson wrote:
> Here's the latest with naming convention (hopefully) followed. I've implemented my
> own squeeze() function and used sizeof in the memmove calls.

I suggest you look into using the range primitives (empty, front, back, popFront, and popBack) with strings of any width. Your code assumes that all characters have the same width and therefore will behave erratically on UTF-8 and UTF-16 encodings.

In the particular case of squeeze(), you may want to use uniq instead, which works on any forward range and will therefore decode characters properly:

http://www.digitalmars.com/d/2.0/phobos/std_algorithm.html#uniq


Andrei
June 22, 2010
== Quote from Andrei Alexandrescu (SeeWebsiteForEmail@erdani.org)'s article
> On 06/22/2010 08:13 AM, Ben Hanson wrote:
> > Here's the latest with naming convention (hopefully) followed. I've implemented my
> > own squeeze() function and used sizeof in the memmove calls.
> I suggest you look into using the range primitives (empty, front,
> back, popFront, and popBack) with strings of any width. Your code
> assumes that all characters have the same width and therefore will
> behave erratically on UTF-8 and UTF-16 encodings.
> In the particular case of squeeze(), you may want to use uniq instead,
> which works on any forward range and will therefore decode characters
> properly:
> http://www.digitalmars.com/d/2.0/phobos/std_algorithm.html#uniq
> Andrei

OK, thanks.

Don't forget these are regular expressions though. I was wondering whether people really want to pass regular expressions UTF encoded, but I suppose it could happen. It's certainly a good idea to get used to using UTF compatible functions anyway.

Is there any support for Unicode continuation characters yet? Do you agree that (ideally) Unicode text should be normalised before searching?

Regards,

Ben
1 2
Next ›   Last »