| Thread overview | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
June 22, 2010 Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Hi there,
I've basically got the string_token class working now. Thanks to everyone who helped. It still needs some work as memmove works with bytes so I need the equivalent of 'sizeof' in D for this.
'squeeze' doesn't work with wide chars, so I will write my own version.
When I shrink or grow char arrays, I'd like to know if I should re-set my pointers into them accordingly.
If anyone can point out any other obvious issues, bad style etc. that would be appreciated. Please bear in mind that I'd like the code to be as fast as possible.
Here's the source:
Regards,
Ben
module main;
import std.algorithm;
import std.array;
import std.c.string;
import std.string;
import std.stdio;
template regex(CharT)
{
struct basic_string_token
{
bool _negated = false;
CharT[] _charset;
enum size_t MAX_CHARS = CharT.max + 1;
enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
this(const bool negated_, ref CharT[] charset_)
{
_negated = negated_;
_charset = charset_;
}
void remove_duplicates()
{
_charset.sort;
_charset = squeeze(_charset.idup).dup;
}
void normalise()
{
if (_charset.length == MAX_CHARS)
{
_negated = !_negated;
_charset.clear();
}
else if (_charset.length > MAX_CHARS / 2)
{
negate();
}
}
void negate()
{
CharT curr_char_ = START_CHAR;
CharT[] temp_;
CharT *ptr_;
CharT *curr_ = _charset.ptr;
CharT *end_ = curr_ + _charset.length;
size_t i_ = 0;
_negated = !_negated;
temp_.length = MAX_CHARS - _charset.length;
ptr_ = temp_.ptr;
while (curr_ < end_)
{
while (*curr_ > curr_char_)
{
*ptr_ = curr_char_;
++ptr_;
++curr_char_;
++i_;
}
++curr_char_;
++curr_;
++i_;
}
for (; i_ < MAX_CHARS; ++i_)
{
*ptr_ = curr_char_;
++ptr_;
++curr_char_;
}
_charset = temp_;
}
bool empty()
{
return _charset.length == 0 && !_negated;
}
bool any()
{
return _charset.length == 0 && _negated;
}
void clear()
{
_negated = false;
_charset.length = 0;
}
void intersect(ref basic_string_token rhs_,
ref basic_string_token overlap_)
{
if ((any() && rhs_.any()) || (_negated == rhs_._negated &&
!any() && !rhs_.any()))
{
intersect_same_types(rhs_, overlap_);
}
else
{
intersect_diff_types(rhs_, overlap_);
}
}
private:
void intersect_same_types(ref basic_string_token rhs_,
ref basic_string_token overlap_)
{
if (any())
{
clear();
overlap_._negated = true;
rhs_.clear();
}
else
{
CharT *iter_ = _charset.ptr;
CharT *end_ = iter_ + _charset.length;
CharT *rhs_iter_ = rhs_._charset.ptr;
CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;
overlap_._negated = _negated;
while (iter_ != end_ && rhs_iter_ != rhs_end_)
{
if (*iter_ < *rhs_iter_)
{
++iter_;
}
else if (*iter_ > *rhs_iter_)
{
++rhs_iter_;
}
else
{
overlap_._charset ~= *iter_;
memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
--end_;
_charset.length -= 1;
memmove(rhs_iter_, rhs_iter_ + 1, rhs_._charset.ptr +
rhs_._charset.length - rhs_iter_);
--rhs_end_;
rhs_._charset.length -= 1;
}
}
if (_negated)
{
// duplicates already merged
// src, dest
merge(_charset, overlap_._charset);
// duplicates already merged
// src, dest
merge(rhs_._charset, overlap_._charset);
_negated = false;
rhs_._negated = false;
swap(_charset, rhs_._charset);
normalise();
overlap_.normalise();
rhs_.normalise();
}
else if (!overlap_._charset.length == 0)
{
normalise();
overlap_.normalise();
rhs_.normalise();
}
}
}
void intersect_diff_types(ref basic_string_token rhs_,
ref basic_string_token overlap_)
{
if (any())
{
intersect_any(rhs_, overlap_);
}
else if (_negated)
{
intersect_negated(rhs_, overlap_);
}
else // _negated == false
{
intersect_charset(rhs_, overlap_);
}
}
void intersect_any(ref basic_string_token rhs_, ref basic_string_token overlap_)
{
if (rhs_._negated)
{
rhs_.intersect_negated(this, overlap_);
}
else // rhs._negated == false
{
rhs_.intersect_charset(this, overlap_);
}
}
void intersect_negated(ref basic_string_token rhs_,
ref basic_string_token overlap_)
{
if (rhs_.any())
{
overlap_._negated = true;
overlap_._charset = _charset;
rhs_._negated = false;
rhs_._charset = _charset;
clear();
}
else // rhs._negated == false
{
rhs_.intersect_charset(this, overlap_);
}
}
void intersect_charset(ref basic_string_token rhs_,
ref basic_string_token overlap_)
{
if (rhs_.any())
{
overlap_._charset = _charset;
rhs_._negated = true;
rhs_._charset = _charset;
clear();
}
else // rhs_._negated == true
{
CharT *iter_ = _charset.ptr;
CharT *end_ = iter_ + _charset.length;
CharT *rhs_iter_ = rhs_._charset.ptr;
CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;
while (iter_ != end_ && rhs_iter_ != rhs_end_)
{
if (*iter_ < *rhs_iter_)
{
overlap_._charset ~= *iter_;
rhs_._charset.length += 1;
rhs_iter_ = rhs_._charset.ptr;
rhs_end_ = rhs_iter_ + rhs_._charset.length;
memmove(rhs_iter_ + 1, rhs_iter_, rhs_._charset.length -
(rhs_end_ - rhs_iter_ - 1));
++rhs_iter_;
memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
_charset.length -= 1;
--end_;
}
else if (*iter_ > *rhs_iter_)
{
++rhs_iter_;
}
else
{
++iter_;
++rhs_iter_;
}
}
if (iter_ != end_)
{
CharT[] temp_;
temp_.length = end_ - iter_;
memmove(temp_.ptr, iter_, temp_.length);
// nothing bigger in rhs_ than iter_
// src, dest
merge(temp_, overlap_._charset);
memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
_charset.length -= 1;
}
if (!overlap_._charset.empty())
{
merge(overlap_._charset, rhs_._charset);
// possible duplicates, so check for any and erase.
rhs_._charset = squeeze(rhs_._charset.idup).dup;
normalise();
overlap_.normalise();
rhs_.normalise();
}
}
}
void merge(ref CharT[] src_, ref CharT[] dest_)
{
CharT[] temp_;
CharT *ptr_;
CharT *iter_ = src_.ptr;
CharT *end_ = iter_ + src_.length;
CharT *dest_iter_ = dest_.ptr;
CharT *dest_end_ = dest_iter_ + dest_.length;
temp_.length = src_.length + dest_.length;
ptr_ = temp_.ptr;
while (iter_ != end_ && dest_iter_ != dest_end_)
{
if (*iter_ < *dest_iter_)
{
*ptr_++ = *iter_++;
}
else
{
*ptr_++ = *dest_iter_++;
}
}
while (iter_ != end_)
{
*ptr_++ = *iter_++;
}
while (dest_iter_ != dest_end_)
{
*ptr_++ = *dest_iter_++;
}
dest_ = temp_;
}
};
}
int main(char[][]argv)
{
regex!(char).basic_string_token lhs_;
regex!(char).basic_string_token rhs_;
regex!(char).basic_string_token intersect_;
lhs_._charset = "abc".dup;
lhs_._negated = true;
rhs_._charset = "bcd".dup;
rhs_._negated = true;
writeln(lhs_._charset, '(', lhs_._negated, ") intersect ",
rhs_._charset, '(', rhs_._negated, ") =");
lhs_.intersect(rhs_, intersect_);
writeln(lhs_._charset, '(', lhs_._negated, "), ",
rhs_._charset, '(', rhs_._negated, "), ",
intersect_._charset, '(', intersect_._negated, ')');
return 0;
}
| ||||
June 22, 2010 Re: Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Ben Hanson | Ben Hanson:
> It still needs some work as memmove works with bytes so I need the equivalent of 'sizeof' in D for this.
T.sizeof gives the size of the init of a variable of type T.
If T is a dynamic array it returns wordsSize*2, so if you need the item size you can write T[0].sizeof.
Why do you use so many underscores?
Bye,
bearophile
| |||
June 22, 2010 Re: Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Ben Hanson | I think sizeof is a property, e.g. char.sizeof or:
struct A {
int a;
char[10] b;
}
A.sizeof
A a;
a.sizeof
you get the picture. (Have I got it?)
-Rory
On Tue, 22 Jun 2010 12:07:37 +0200, Ben Hanson <Ben.Hanson@tfbplc.co.uk> wrote:
> Hi there,
>
> I've basically got the string_token class working now. Thanks to everyone who
> helped. It still needs some work as memmove works with bytes so I need the
> equivalent of 'sizeof' in D for this.
>
> 'squeeze' doesn't work with wide chars, so I will write my own version.
>
> When I shrink or grow char arrays, I'd like to know if I should re-set my
> pointers into them accordingly.
>
> If anyone can point out any other obvious issues, bad style etc. that would be
> appreciated. Please bear in mind that I'd like the code to be as fast as possible.
>
> Here's the source:
>
> Regards,
>
> Ben
>
> module main;
>
> import std.algorithm;
> import std.array;
> import std.c.string;
> import std.string;
>
> import std.stdio;
>
> template regex(CharT)
> {
> struct basic_string_token
> {
> bool _negated = false;
> CharT[] _charset;
> enum size_t MAX_CHARS = CharT.max + 1;
> enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
>
> this(const bool negated_, ref CharT[] charset_)
> {
> _negated = negated_;
> _charset = charset_;
> }
>
> void remove_duplicates()
> {
> _charset.sort;
> _charset = squeeze(_charset.idup).dup;
> }
>
> void normalise()
> {
> if (_charset.length == MAX_CHARS)
> {
> _negated = !_negated;
> _charset.clear();
> }
> else if (_charset.length > MAX_CHARS / 2)
> {
> negate();
> }
> }
>
> void negate()
> {
> CharT curr_char_ = START_CHAR;
> CharT[] temp_;
> CharT *ptr_;
> CharT *curr_ = _charset.ptr;
> CharT *end_ = curr_ + _charset.length;
> size_t i_ = 0;
>
> _negated = !_negated;
> temp_.length = MAX_CHARS - _charset.length;
> ptr_ = temp_.ptr;
>
> while (curr_ < end_)
> {
> while (*curr_ > curr_char_)
> {
> *ptr_ = curr_char_;
> ++ptr_;
> ++curr_char_;
> ++i_;
> }
>
> ++curr_char_;
> ++curr_;
> ++i_;
> }
>
> for (; i_ < MAX_CHARS; ++i_)
> {
> *ptr_ = curr_char_;
> ++ptr_;
> ++curr_char_;
> }
>
> _charset = temp_;
> }
>
> bool empty()
> {
> return _charset.length == 0 && !_negated;
> }
>
> bool any()
> {
> return _charset.length == 0 && _negated;
> }
>
> void clear()
> {
> _negated = false;
> _charset.length = 0;
> }
>
> void intersect(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if ((any() && rhs_.any()) || (_negated == rhs_._negated &&
> !any() && !rhs_.any()))
> {
> intersect_same_types(rhs_, overlap_);
> }
> else
> {
> intersect_diff_types(rhs_, overlap_);
> }
> }
>
> private:
> void intersect_same_types(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if (any())
> {
> clear();
> overlap_._negated = true;
> rhs_.clear();
> }
> else
> {
> CharT *iter_ = _charset.ptr;
> CharT *end_ = iter_ + _charset.length;
> CharT *rhs_iter_ = rhs_._charset.ptr;
> CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;
>
> overlap_._negated = _negated;
>
> while (iter_ != end_ && rhs_iter_ != rhs_end_)
> {
> if (*iter_ < *rhs_iter_)
> {
> ++iter_;
> }
> else if (*iter_ > *rhs_iter_)
> {
> ++rhs_iter_;
> }
> else
> {
> overlap_._charset ~= *iter_;
> memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
> --end_;
> _charset.length -= 1;
> memmove(rhs_iter_, rhs_iter_ + 1, rhs_._charset.ptr +
> rhs_._charset.length - rhs_iter_);
> --rhs_end_;
> rhs_._charset.length -= 1;
> }
> }
>
> if (_negated)
> {
> // duplicates already merged
> // src, dest
> merge(_charset, overlap_._charset);
> // duplicates already merged
> // src, dest
> merge(rhs_._charset, overlap_._charset);
> _negated = false;
> rhs_._negated = false;
> swap(_charset, rhs_._charset);
> normalise();
> overlap_.normalise();
> rhs_.normalise();
> }
> else if (!overlap_._charset.length == 0)
> {
> normalise();
> overlap_.normalise();
> rhs_.normalise();
> }
> }
> }
>
> void intersect_diff_types(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if (any())
> {
> intersect_any(rhs_, overlap_);
> }
> else if (_negated)
> {
> intersect_negated(rhs_, overlap_);
> }
> else // _negated == false
> {
> intersect_charset(rhs_, overlap_);
> }
> }
>
> void intersect_any(ref basic_string_token rhs_, ref basic_string_token overlap_)
> {
> if (rhs_._negated)
> {
> rhs_.intersect_negated(this, overlap_);
> }
> else // rhs._negated == false
> {
> rhs_.intersect_charset(this, overlap_);
> }
> }
>
> void intersect_negated(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if (rhs_.any())
> {
> overlap_._negated = true;
> overlap_._charset = _charset;
> rhs_._negated = false;
> rhs_._charset = _charset;
> clear();
> }
> else // rhs._negated == false
> {
> rhs_.intersect_charset(this, overlap_);
> }
> }
>
> void intersect_charset(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if (rhs_.any())
> {
> overlap_._charset = _charset;
> rhs_._negated = true;
> rhs_._charset = _charset;
> clear();
> }
> else // rhs_._negated == true
> {
> CharT *iter_ = _charset.ptr;
> CharT *end_ = iter_ + _charset.length;
> CharT *rhs_iter_ = rhs_._charset.ptr;
> CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;
>
> while (iter_ != end_ && rhs_iter_ != rhs_end_)
> {
> if (*iter_ < *rhs_iter_)
> {
> overlap_._charset ~= *iter_;
> rhs_._charset.length += 1;
> rhs_iter_ = rhs_._charset.ptr;
> rhs_end_ = rhs_iter_ + rhs_._charset.length;
> memmove(rhs_iter_ + 1, rhs_iter_, rhs_._charset.length -
> (rhs_end_ - rhs_iter_ - 1));
> ++rhs_iter_;
> memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
> _charset.length -= 1;
> --end_;
> }
> else if (*iter_ > *rhs_iter_)
> {
> ++rhs_iter_;
> }
> else
> {
> ++iter_;
> ++rhs_iter_;
> }
> }
>
> if (iter_ != end_)
> {
> CharT[] temp_;
>
> temp_.length = end_ - iter_;
> memmove(temp_.ptr, iter_, temp_.length);
>
> // nothing bigger in rhs_ than iter_
> // src, dest
> merge(temp_, overlap_._charset);
> memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
> _charset.length -= 1;
> }
>
> if (!overlap_._charset.empty())
> {
> merge(overlap_._charset, rhs_._charset);
> // possible duplicates, so check for any and erase.
> rhs_._charset = squeeze(rhs_._charset.idup).dup;
> normalise();
> overlap_.normalise();
> rhs_.normalise();
> }
> }
> }
>
> void merge(ref CharT[] src_, ref CharT[] dest_)
> {
> CharT[] temp_;
> CharT *ptr_;
> CharT *iter_ = src_.ptr;
> CharT *end_ = iter_ + src_.length;
> CharT *dest_iter_ = dest_.ptr;
> CharT *dest_end_ = dest_iter_ + dest_.length;
>
> temp_.length = src_.length + dest_.length;
> ptr_ = temp_.ptr;
>
> while (iter_ != end_ && dest_iter_ != dest_end_)
> {
> if (*iter_ < *dest_iter_)
> {
> *ptr_++ = *iter_++;
> }
> else
> {
> *ptr_++ = *dest_iter_++;
> }
> }
>
> while (iter_ != end_)
> {
> *ptr_++ = *iter_++;
> }
>
> while (dest_iter_ != dest_end_)
> {
> *ptr_++ = *dest_iter_++;
> }
>
> dest_ = temp_;
> }
> };
> }
>
> int main(char[][]argv)
> {
> regex!(char).basic_string_token lhs_;
> regex!(char).basic_string_token rhs_;
> regex!(char).basic_string_token intersect_;
>
> lhs_._charset = "abc".dup;
> lhs_._negated = true;
> rhs_._charset = "bcd".dup;
> rhs_._negated = true;
> writeln(lhs_._charset, '(', lhs_._negated, ") intersect ",
> rhs_._charset, '(', rhs_._negated, ") =");
> lhs_.intersect(rhs_, intersect_);
> writeln(lhs_._charset, '(', lhs_._negated, "), ",
> rhs_._charset, '(', rhs_._negated, "), ",
> intersect_._charset, '(', intersect_._negated, ')');
> return 0;
> }
| |||
June 22, 2010 Re: Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Posted in reply to bearophile | == Quote from bearophile (bearophileHUGS@lycos.com)'s article > Ben Hanson: > > It still needs some work as memmove works with bytes so I need the equivalent of 'sizeof' in D for this. > T.sizeof gives the size of the init of a variable of type T. > If T is a dynamic array it returns wordsSize*2, so if you need the item size you can write T[0].sizeof. > Why do you use so many underscores? > Bye, > bearophile D'oh! I think I've seen that about now you mention it... The underscores thing just comes from the C++ source. I was recommended that approach, as not wanting to use Reverse Polish Notation (i.e. MFC style), the underscores allow you to have a type the same name as a member var or local var. Thanks, Ben | |||
June 22, 2010 Re: Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Rory McGuire | == Quote from Rory McGuire (rmcguire@neonova.co.za)'s article
> I think sizeof is a property, e.g. char.sizeof or:
> struct A {
> int a;
> char[10] b;
> }
> A.sizeof
> A a;
> a.sizeof
> you get the picture. (Have I got it?)
> -Rory
Thanks Rory.
| |||
June 22, 2010 Re: Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Ben Hanson | Ben Hanson: > The underscores thing just comes from the C++ source. But once your program works well you can change the variable names a little, or even before if you have some kind of IDE. In D style guide structs and classes need to start with an upper case, in CamelCase. And variable names are written in camelCase with a starting lower case: http://www.digitalmars.com/d/2.0/dstyle.html Following a common style guide is important. > I was recommended that > approach, as not wanting to use Reverse Polish Notation (i.e. MFC style), I think you mean polish with no reverse :-) > the underscores allow you to have a type the same name as a member var or local var. I don't understand. Why can't you write the code like this? struct BasicStringToken { enum size_t MAX_CHARS = CharT.max + 1; enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0; private bool negated = false; private CharT[] charset; this(const bool negated_, ref CharT[] charset_) { negated = negated_; charset = charset_; } I have kept the underscores in the arguments of the method because they have a limited scope/life, so they don't add a lot of noise to the whole code. Bye, bearophile | |||
June 22, 2010 Re: Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Posted in reply to bearophile | == Quote from bearophile (bearophileHUGS@lycos.com)'s article > Ben Hanson: > > The underscores thing just comes from the C++ source. > But once your program works well you can change the variable names a little, or even before if you have some kind of IDE. > In D style guide structs and classes need to start with an upper case, in CamelCase. And variable names are written in camelCase with a starting lower case: > http://www.digitalmars.com/d/2.0/dstyle.html > Following a common style guide is important. I'm happy to change the naming style when things are running. > > I was recommended that > > approach, as not wanting to use Reverse Polish Notation (i.e. MFC style), > I think you mean polish with no reverse :-) RPN is for compilers isn't it?! I think it was called Hungarian Notation (possibly)... > > the underscores allow you to have a type the same name as a member var or local var. > I don't understand. > Why can't you write the code like this? > struct BasicStringToken > { > enum size_t MAX_CHARS = CharT.max + 1; > enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0; > private bool negated = false; > private CharT[] charset; > this(const bool negated_, ref CharT[] charset_) > { > negated = negated_; > charset = charset_; > } > I have kept the underscores in the arguments of the method because they have a limited scope/life, so they don't add a lot of noise to the whole code. The code so far isn't a good example of that, but it's generally when you typedefed a template and then wanted to name a var with the same name as the type. Regardles, as you pointed out, D does things differently, which is fine. Regards, Ben | |||
June 22, 2010 Re: Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Ben Hanson | > == Quote from bearophile (bearophileHUGS@lycos.com)'s article
> > In D style guide structs and classes need to start with an upper case, in
> CamelCase. And variable names are written in camelCase with a starting lower case:
> > http://www.digitalmars.com/d/2.0/dstyle.html
> > Following a common style guide is important.
Actually, I may as well convert as I go.
Regards,
Ben
| |||
June 22, 2010 Re: Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Posted in reply to bearophile | Here's the latest with naming convention (hopefully) followed. I've implemented my
own squeeze() function and used sizeof in the memmove calls.
How can I specify wide strings for the literals?
Thanks,
Ben
module main;
import std.algorithm;
import std.array;
import std.c.string;
import std.string;
import std.stdio;
template regex(CharT)
{
struct BasicStringToken
{
bool negated = false;
CharT[] charset;
enum size_t MAX_CHARS = CharT.max + 1;
enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
this(const bool negated_, ref CharT[] charset_)
{
negated = negated_;
charset = charset_;
}
void removeDuplicates()
{
charset.sort;
squeeze(charset);
}
void normalise()
{
if (charset.length == MAX_CHARS)
{
negated = !negated;
charset.clear();
}
else if (charset.length > MAX_CHARS / 2)
{
negate();
}
}
void negate()
{
CharT curr_char = START_CHAR;
CharT[] temp;
CharT *ptr = cast(CharT *) 0;
CharT *curr = charset.ptr;
CharT *end = curr + charset.length;
size_t i = 0;
negated = !negated;
temp.length = MAX_CHARS - charset.length;
ptr = temp.ptr;
while (curr < end)
{
while (*curr > curr_char)
{
*ptr = curr_char;
++ptr;
++curr_char;
++i;
}
++curr_char;
++curr;
++i;
}
for (; i < MAX_CHARS; ++i)
{
*ptr = curr_char;
++ptr;
++curr_char;
}
charset = temp;
}
bool empty()
{
return charset.length == 0 && !negated;
}
bool any()
{
return charset.length == 0 && negated;
}
void clear()
{
negated = false;
charset.length = 0;
}
void intersect(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if ((any() && rhs.any()) || (negated == rhs.negated &&
!any() && !rhs.any()))
{
intersectSameTypes(rhs, overlap);
}
else
{
intersectDiffTypes(rhs, overlap);
}
}
private:
void intersectSameTypes(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if (any())
{
clear();
overlap.negated = true;
rhs.clear();
}
else
{
CharT *iter = charset.ptr;
CharT *end = iter + charset.length;
CharT *rhs_iter = rhs.charset.ptr;
CharT *rhs_end = rhs_iter + rhs.charset.length;
overlap.negated = negated;
while (iter != end && rhs_iter != rhs_end)
{
if (*iter < *rhs_iter)
{
++iter;
}
else if (*iter > *rhs_iter)
{
++rhs_iter;
}
else
{
overlap.charset ~= *iter;
memmove(iter, iter + 1, (charset.ptr +
charset.length - iter) * CharT.sizeof);
--end;
charset.length -= 1;
memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
rhs.charset.length - rhs_iter) * CharT.sizeof);
--rhs_end;
rhs.charset.length -= 1;
}
}
if (negated)
{
// duplicates already merged
// src, dest
merge(charset, overlap.charset);
// duplicates already merged
// src, dest
merge(rhs.charset, overlap.charset);
negated = false;
rhs.negated = false;
swap(charset, rhs.charset);
normalise();
overlap.normalise();
rhs.normalise();
}
else if (!overlap.charset.length == 0)
{
normalise();
overlap.normalise();
rhs.normalise();
}
}
}
void intersectDiffTypes(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if (any())
{
intersectAny(rhs, overlap);
}
else if (negated)
{
intersectNegated(rhs, overlap);
}
else // negated == false
{
intersectCharset(rhs, overlap);
}
}
void intersectAny(ref BasicStringToken rhs, ref BasicStringToken overlap)
{
if (rhs.negated)
{
rhs.intersectNegated(this, overlap);
}
else // rhs.negated == false
{
rhs.intersectCharset(this, overlap);
}
}
void intersectNegated(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if (rhs.any())
{
overlap.negated = true;
overlap.charset = charset;
rhs.negated = false;
rhs.charset = charset;
clear();
}
else // rhs.negated == false
{
rhs.intersectCharset(this, overlap);
}
}
void intersectCharset(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if (rhs.any())
{
overlap.charset = charset;
rhs.negated = true;
rhs.charset = charset;
clear();
}
else // rhs.negated == true
{
CharT *iter = charset.ptr;
CharT *end = iter + charset.length;
CharT *rhs_iter = rhs.charset.ptr;
CharT *rhs_end = rhs_iter + rhs.charset.length;
while (iter != end && rhs_iter != rhs_end)
{
if (*iter < *rhs_iter)
{
overlap.charset ~= *iter;
rhs.charset.length += 1;
rhs_iter = rhs.charset.ptr;
rhs_end = rhs_iter + rhs.charset.length;
memmove(rhs_iter + 1, rhs_iter, (rhs.charset.length -
(rhs_end - rhs_iter - 1)) * CharT.sizeof);
++rhs_iter;
memmove(iter, iter + 1, (charset.ptr +
charset.length - iter) * CharT.sizeof);
charset.length -= 1;
--end;
}
else if (*iter > *rhs_iter)
{
++rhs_iter;
}
else
{
++iter;
++rhs_iter;
}
}
if (iter != end)
{
CharT[] temp;
temp.length = end - iter;
memmove(temp.ptr, iter, temp.length * CharT.sizeof);
// nothing bigger in rhs than iter
// src, dest
merge(temp, overlap.charset);
memmove(iter, iter + 1, (charset.ptr +
charset.length - iter) * CharT.sizeof);
charset.length -= 1;
}
if (!overlap.charset.empty())
{
merge(overlap.charset, rhs.charset);
// possible duplicates, so check for any and erase.
squeeze(rhs.charset);
normalise();
overlap.normalise();
rhs.normalise();
}
}
}
void squeeze(ref CharT[] str)
{
if (str.length > 1)
{
CharT *write = str.ptr;
CharT *end = write + str.length;
CharT *read = write + 1;
while (read != end)
{
while (read != end && *read == *write)
{
++read;
}
if (read == end) break;
++write;
if (read > write)
{
*write = *read;
}
++read;
}
str.length = write + 1 - str.ptr;
}
}
void merge(ref CharT[] src, ref CharT[] dest)
{
CharT[] temp;
CharT *ptr;
CharT *iter = src.ptr;
CharT *end = iter + src.length;
CharT *dest_iter = dest.ptr;
CharT *dest_end = dest_iter + dest.length;
temp.length = src.length + dest.length;
ptr = temp.ptr;
while (iter != end && dest_iter != dest_end)
{
if (*iter < *dest_iter)
{
*ptr++ = *iter++;
}
else
{
*ptr++ = *dest_iter++;
}
}
while (iter != end)
{
*ptr++ = *iter++;
}
while (dest_iter != dest_end)
{
*ptr++ = *dest_iter++;
}
dest = temp;
}
};
}
int main(char[][]argv)
{
regex!(char).BasicStringToken lhs;
regex!(char).BasicStringToken rhs;
regex!(char).BasicStringToken intersect;
lhs.charset = "aaabbc".dup;
lhs.negated = true;
lhs.removeDuplicates();
rhs.charset = "bccddd".dup;
rhs.negated = true;
rhs.removeDuplicates();
writeln(lhs.charset, '(', lhs.negated, ") intersect ",
rhs.charset, '(', rhs.negated, ") =");
lhs.intersect(rhs, intersect);
writeln(lhs.charset, '(', lhs.negated, "), ",
rhs.charset, '(', rhs.negated, "), ",
intersect.charset, '(', intersect.negated, ')');
return 0;
}
| |||
June 22, 2010 Re: Latest string_token Code | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Ben Hanson | "the string"w
gives you 16bit I believe. postfix with a 'd' should give you 32bit.
On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson <Ben.Hanson@tfbplc.co.uk> wrote:
> Here's the latest with naming convention (hopefully) followed. I've implemented my
> own squeeze() function and used sizeof in the memmove calls.
>
> How can I specify wide strings for the literals?
>
> Thanks,
>
> Ben
>
> module main;
>
> import std.algorithm;
> import std.array;
> import std.c.string;
> import std.string;
>
> import std.stdio;
>
> template regex(CharT)
> {
> struct BasicStringToken
> {
> bool negated = false;
> CharT[] charset;
> enum size_t MAX_CHARS = CharT.max + 1;
> enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
>
> this(const bool negated_, ref CharT[] charset_)
> {
> negated = negated_;
> charset = charset_;
> }
>
> void removeDuplicates()
> {
> charset.sort;
> squeeze(charset);
> }
>
> void normalise()
> {
> if (charset.length == MAX_CHARS)
> {
> negated = !negated;
> charset.clear();
> }
> else if (charset.length > MAX_CHARS / 2)
> {
> negate();
> }
> }
>
> void negate()
> {
> CharT curr_char = START_CHAR;
> CharT[] temp;
> CharT *ptr = cast(CharT *) 0;
> CharT *curr = charset.ptr;
> CharT *end = curr + charset.length;
> size_t i = 0;
>
> negated = !negated;
> temp.length = MAX_CHARS - charset.length;
> ptr = temp.ptr;
>
> while (curr < end)
> {
> while (*curr > curr_char)
> {
> *ptr = curr_char;
> ++ptr;
> ++curr_char;
> ++i;
> }
>
> ++curr_char;
> ++curr;
> ++i;
> }
>
> for (; i < MAX_CHARS; ++i)
> {
> *ptr = curr_char;
> ++ptr;
> ++curr_char;
> }
>
> charset = temp;
> }
>
> bool empty()
> {
> return charset.length == 0 && !negated;
> }
>
> bool any()
> {
> return charset.length == 0 && negated;
> }
>
> void clear()
> {
> negated = false;
> charset.length = 0;
> }
>
> void intersect(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if ((any() && rhs.any()) || (negated == rhs.negated &&
> !any() && !rhs.any()))
> {
> intersectSameTypes(rhs, overlap);
> }
> else
> {
> intersectDiffTypes(rhs, overlap);
> }
> }
>
> private:
> void intersectSameTypes(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if (any())
> {
> clear();
> overlap.negated = true;
> rhs.clear();
> }
> else
> {
> CharT *iter = charset.ptr;
> CharT *end = iter + charset.length;
> CharT *rhs_iter = rhs.charset.ptr;
> CharT *rhs_end = rhs_iter + rhs.charset.length;
>
> overlap.negated = negated;
>
> while (iter != end && rhs_iter != rhs_end)
> {
> if (*iter < *rhs_iter)
> {
> ++iter;
> }
> else if (*iter > *rhs_iter)
> {
> ++rhs_iter;
> }
> else
> {
> overlap.charset ~= *iter;
> memmove(iter, iter + 1, (charset.ptr +
> charset.length - iter) * CharT.sizeof);
> --end;
> charset.length -= 1;
> memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
> rhs.charset.length - rhs_iter) * CharT.sizeof);
> --rhs_end;
> rhs.charset.length -= 1;
> }
> }
>
> if (negated)
> {
> // duplicates already merged
> // src, dest
> merge(charset, overlap.charset);
> // duplicates already merged
> // src, dest
> merge(rhs.charset, overlap.charset);
> negated = false;
> rhs.negated = false;
> swap(charset, rhs.charset);
> normalise();
> overlap.normalise();
> rhs.normalise();
> }
> else if (!overlap.charset.length == 0)
> {
> normalise();
> overlap.normalise();
> rhs.normalise();
> }
> }
> }
>
> void intersectDiffTypes(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if (any())
> {
> intersectAny(rhs, overlap);
> }
> else if (negated)
> {
> intersectNegated(rhs, overlap);
> }
> else // negated == false
> {
> intersectCharset(rhs, overlap);
> }
> }
>
> void intersectAny(ref BasicStringToken rhs, ref BasicStringToken overlap)
> {
> if (rhs.negated)
> {
> rhs.intersectNegated(this, overlap);
> }
> else // rhs.negated == false
> {
> rhs.intersectCharset(this, overlap);
> }
> }
>
> void intersectNegated(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if (rhs.any())
> {
> overlap.negated = true;
> overlap.charset = charset;
> rhs.negated = false;
> rhs.charset = charset;
> clear();
> }
> else // rhs.negated == false
> {
> rhs.intersectCharset(this, overlap);
> }
> }
>
> void intersectCharset(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if (rhs.any())
> {
> overlap.charset = charset;
> rhs.negated = true;
> rhs.charset = charset;
> clear();
> }
> else // rhs.negated == true
> {
> CharT *iter = charset.ptr;
> CharT *end = iter + charset.length;
> CharT *rhs_iter = rhs.charset.ptr;
> CharT *rhs_end = rhs_iter + rhs.charset.length;
>
> while (iter != end && rhs_iter != rhs_end)
> {
> if (*iter < *rhs_iter)
> {
> overlap.charset ~= *iter;
> rhs.charset.length += 1;
> rhs_iter = rhs.charset.ptr;
> rhs_end = rhs_iter + rhs.charset.length;
> memmove(rhs_iter + 1, rhs_iter, (rhs.charset.length -
> (rhs_end - rhs_iter - 1)) * CharT.sizeof);
> ++rhs_iter;
> memmove(iter, iter + 1, (charset.ptr +
> charset.length - iter) * CharT.sizeof);
> charset.length -= 1;
> --end;
> }
> else if (*iter > *rhs_iter)
> {
> ++rhs_iter;
> }
> else
> {
> ++iter;
> ++rhs_iter;
> }
> }
>
> if (iter != end)
> {
> CharT[] temp;
>
> temp.length = end - iter;
> memmove(temp.ptr, iter, temp.length * CharT.sizeof);
>
> // nothing bigger in rhs than iter
> // src, dest
> merge(temp, overlap.charset);
> memmove(iter, iter + 1, (charset.ptr +
> charset.length - iter) * CharT.sizeof);
> charset.length -= 1;
> }
>
> if (!overlap.charset.empty())
> {
> merge(overlap.charset, rhs.charset);
> // possible duplicates, so check for any and erase.
> squeeze(rhs.charset);
> normalise();
> overlap.normalise();
> rhs.normalise();
> }
> }
> }
>
> void squeeze(ref CharT[] str)
> {
> if (str.length > 1)
> {
> CharT *write = str.ptr;
> CharT *end = write + str.length;
> CharT *read = write + 1;
>
> while (read != end)
> {
> while (read != end && *read == *write)
> {
> ++read;
> }
>
> if (read == end) break;
>
> ++write;
>
> if (read > write)
> {
> *write = *read;
> }
>
> ++read;
> }
>
> str.length = write + 1 - str.ptr;
> }
> }
>
> void merge(ref CharT[] src, ref CharT[] dest)
> {
> CharT[] temp;
> CharT *ptr;
> CharT *iter = src.ptr;
> CharT *end = iter + src.length;
> CharT *dest_iter = dest.ptr;
> CharT *dest_end = dest_iter + dest.length;
>
> temp.length = src.length + dest.length;
> ptr = temp.ptr;
>
> while (iter != end && dest_iter != dest_end)
> {
> if (*iter < *dest_iter)
> {
> *ptr++ = *iter++;
> }
> else
> {
> *ptr++ = *dest_iter++;
> }
> }
>
> while (iter != end)
> {
> *ptr++ = *iter++;
> }
>
> while (dest_iter != dest_end)
> {
> *ptr++ = *dest_iter++;
> }
>
> dest = temp;
> }
> };
> }
>
> int main(char[][]argv)
> {
> regex!(char).BasicStringToken lhs;
> regex!(char).BasicStringToken rhs;
> regex!(char).BasicStringToken intersect;
>
> lhs.charset = "aaabbc".dup;
> lhs.negated = true;
> lhs.removeDuplicates();
> rhs.charset = "bccddd".dup;
> rhs.negated = true;
> rhs.removeDuplicates();
> writeln(lhs.charset, '(', lhs.negated, ") intersect ",
> rhs.charset, '(', rhs.negated, ") =");
> lhs.intersect(rhs, intersect);
> writeln(lhs.charset, '(', lhs.negated, "), ",
> rhs.charset, '(', rhs.negated, "), ",
> intersect.charset, '(', intersect.negated, ')');
> return 0;
> }
| |||
Copyright © 1999-2021 by the D Language Foundation
Permalink
Reply