Thread overview | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
April 03, 2006 How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
At first, thank all my helpers. :) I have some email addresses in a text file, about 780000 lines. There are some repeated lines, about 8%, and I want to remove them. I write a D version: ======================================= import std.stdio; import std.string; import std.stdio; import std.string; import std.perf; int main(char[][] argv) { if (argv.length < 3) { writefln("Wrong arguments"); return 1; } const int READ_SIZE = 1024; FILE* fin = fopen(argv[1], "r"); FILE* fout = fopen(argv[2], "w"); char buffer[READ_SIZE]; int[char[]] emails; PerformanceCounter counter = new PerformanceCounter(); counter.start(); while (!feof(fin)){ fgets(cast(char*)buffer, READ_SIZE, fin); char[] email = toString(cast(char*)buffer); if (!(email in emails)){ emails[toString(buffer)] = 0; fputs(cast(char*)email, fout); } } fclose(fout); fclose(fin); counter.stop(); writefln(counter.milliseconds()); return 0; } ====================================== It used 1080 ms on my computer. A optimized c++ version: ====================================== #include <iostream> #include <string> #include <fstream> #include <iterator> #include <sys/time.h> #include <vector> using namespace std; size_t my_hash (const char* str) { size_t ret = 0; while (*str) ret = 11 * ret + *str++; return ret; } class Email { private: size_t hash; const char* mail; friend bool operator < (const Email& lhs, const Email& rhs); public: Email (const char* mail_) : hash(my_hash(mail_)), mail(mail_) { } bool operator == (const Email& rhs) { if (hash == rhs.hash) return strcmp(mail, rhs.mail) == 0; return false; } const char* getEmail()const { return mail; } }; bool operator < (const Email& lhs, const Email& rhs) { if (lhs.hash == rhs.hash) return strcmp(lhs.mail, rhs.mail) < 0; return lhs.hash < rhs.hash; } int main(int argc, char** argv) { if (argc < 3) { cout << "Wrong arguments" << endl; return 1; } FILE* fin = fopen(argv[1], "r"); if (!fin) { cout << "Invalid input file" << endl; return 2; } FILE* fout = fopen(argv[2], "w"); if (!fout) { fclose(fin); cout << "Invalid output file" << endl; return 3; } timeval start, end; const int BUF_SIZE = 20 * 1024 * 1024; char* buffer = new char[BUF_SIZE]; memset(buffer, 0, BUF_SIZE); gettimeofday(&start, 0); vector<Email> emails; size_t read = fread (buffer, BUF_SIZE, 1, fin); char* begin = buffer; char* current = buffer; while (*current != '\0') { if (*current == '\n') { *current = '\0'; emails.push_back(begin); begin = current + 1; } ++ current; } fclose(fin); sort(emails.begin(), emails.end()); emails.erase (unique( emails.begin(), emails.end() ), emails.end()); for (vector<Email>::const_iterator iter = emails.begin(); iter != emails.end(); iter ++) { fputs((*iter).getEmail(), fout); fwrite("\n", 1, 1, fout); } fclose(fout); gettimeofday(&end, 0); printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) / 1000)); delete[] buffer; return 0; } ===================================================== It used 680 ms. I modified the D version to: ==================================================== import std.stdio; import std.string; import std.perf; int main(char[][] argv) { if (argv.length < 3) { writefln("Wrong arguments"); return 1; } const int BUF_SIZE = 20 * 1024 * 1024; char* buffer = new char[BUF_SIZE]; PerformanceCounter counter = new PerformanceCounter(); counter.start(); FILE* fin = fopen(argv[1], "r"); FILE* fout = fopen(argv[2], "w"); fread(buffer, BUF_SIZE, 1, fin); fclose(fin); int[char[]] emails; char* begin = buffer; char* current = begin; char* newline = cast(char*)"\n"; counter.stop(); writefln(counter.milliseconds()); while (*current != '\0') { if (*current == '\n') { *current = '\0'; char[] email = toString(begin); if (!(email in emails)) { emails[email] = 0; fputs (begin, fout); fwrite(newline, 1, 1, fout); } begin = current + 1; } ++ current; } fclose(fout); counter.stop(); writefln(counter.milliseconds()); delete buffer; return 0; } ==================================================== But it used 3600 ms... :( I don't know how to accelerate it in D. :( Can you help me? |
April 03, 2006 Re: How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
Posted in reply to Li Jie | Li Jie wrote: > At first, thank all my helpers. :) > > > I have some email addresses in a text file, about 780000 lines. There are some repeated lines, about 8%, and I want to remove them. > > I write a D version: > ======================================= > import std.stdio; > import std.string; > import std.stdio; > import std.string; > import std.perf; > > int main(char[][] argv) > { > if (argv.length < 3) { > writefln("Wrong arguments"); > return 1; > } > > const int READ_SIZE = 1024; > > FILE* fin = fopen(argv[1], "r"); > FILE* fout = fopen(argv[2], "w"); > char buffer[READ_SIZE]; > int[char[]] emails; > > PerformanceCounter counter = new PerformanceCounter(); > counter.start(); > while (!feof(fin)){ > fgets(cast(char*)buffer, READ_SIZE, fin); > char[] email = toString(cast(char*)buffer); > if (!(email in emails)){ > emails[toString(buffer)] = 0; > fputs(cast(char*)email, fout); > } > } > > fclose(fout); > fclose(fin); > counter.stop(); > > writefln(counter.milliseconds()); > return 0; > } > ====================================== > > It used 1080 ms on my computer. > > A optimized c++ version: > ====================================== > #include <iostream> > #include <string> > #include <fstream> > #include <iterator> > #include <sys/time.h> > #include <vector> > using namespace std; > > > size_t my_hash (const char* str) > { > size_t ret = 0; > while (*str) > ret = 11 * ret + *str++; > return ret; > } > > class Email > { > private: > size_t hash; > const char* mail; > friend bool operator < (const Email& lhs, const Email& rhs); > public: > Email (const char* mail_) > : hash(my_hash(mail_)), mail(mail_) > { > } > > bool operator == (const Email& rhs) > { > if (hash == rhs.hash) > return strcmp(mail, rhs.mail) == 0; > return false; > } > > const char* getEmail()const > { > return mail; > } > }; > > bool operator < (const Email& lhs, const Email& rhs) > { > if (lhs.hash == rhs.hash) > return strcmp(lhs.mail, rhs.mail) < 0; > return lhs.hash < rhs.hash; > } > > int main(int argc, char** argv) > { > if (argc < 3) > { > cout << "Wrong arguments" << endl; > return 1; > } > > FILE* fin = fopen(argv[1], "r"); > if (!fin) > { > cout << "Invalid input file" << endl; > return 2; > } > FILE* fout = fopen(argv[2], "w"); > if (!fout) > { > fclose(fin); > cout << "Invalid output file" << endl; > return 3; > } > > timeval start, end; > > const int BUF_SIZE = 20 * 1024 * 1024; > char* buffer = new char[BUF_SIZE]; > memset(buffer, 0, BUF_SIZE); > > gettimeofday(&start, 0); > vector<Email> emails; > > size_t read = fread (buffer, BUF_SIZE, 1, fin); > char* begin = buffer; > char* current = buffer; > > while (*current != '\0') > { > if (*current == '\n') > { > *current = '\0'; > emails.push_back(begin); > begin = current + 1; > } > ++ current; > } > fclose(fin); > sort(emails.begin(), emails.end()); > emails.erase (unique( emails.begin(), emails.end() ), emails.end()); > > for (vector<Email>::const_iterator iter = emails.begin(); > iter != emails.end(); > iter ++) > { > fputs((*iter).getEmail(), fout); > fwrite("\n", 1, 1, fout); > } > > fclose(fout); > > gettimeofday(&end, 0); > > printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + > (end.tv_usec - start.tv_usec) / 1000)); > > delete[] buffer; > > return 0; > } > ===================================================== > It used 680 ms. > > I modified the D version to: > ==================================================== > import std.stdio; > import std.string; > import std.perf; > > int main(char[][] argv) > { > if (argv.length < 3) { > writefln("Wrong arguments"); > return 1; > } > > const int BUF_SIZE = 20 * 1024 * 1024; > char* buffer = new char[BUF_SIZE]; > > PerformanceCounter counter = new PerformanceCounter(); > counter.start(); > > FILE* fin = fopen(argv[1], "r"); > FILE* fout = fopen(argv[2], "w"); > > fread(buffer, BUF_SIZE, 1, fin); > fclose(fin); > > int[char[]] emails; > char* begin = buffer; > char* current = begin; > char* newline = cast(char*)"\n"; > > counter.stop(); > writefln(counter.milliseconds()); > > while (*current != '\0') > { > if (*current == '\n') > { > *current = '\0'; > char[] email = toString(begin); > if (!(email in emails)) > { > emails[email] = 0; > fputs (begin, fout); > fwrite(newline, 1, 1, fout); > } > begin = current + 1; > } > ++ current; > } > > fclose(fout); > counter.stop(); > > writefln(counter.milliseconds()); > > delete buffer; > return 0; > } > ==================================================== > > But it used 3600 ms... :( > > > I don't know how to accelerate it in D. :( > Can you help me? I would recommend using the Mango library and it's textual iterators. See the example at: http://www.dsource.org/projects/mango/browser/trunk/example/lineio.d A few tweaks to that example should get you on your way. Also, when compiling, make sure to use the -O -inlinc -release flags. -O optimizes, -inline allows smaller functions to be inlined, and -release removes things like bound checking. I've found that using these flags improves performance significantly. ~John Demme |
April 03, 2006 Re: How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
Posted in reply to Li Jie |
Li Jie wrote:
> At first, thank all my helpers. :)
>
>
> I have some email addresses in a text file, about 780000 lines. There are some
> repeated lines, about 8%, and I want to remove them.
>
> I write a D version:
> =======================================
> import std.stdio;
> import std.string;
> import std.stdio;
> import std.string;
> import std.perf;
>
> int main(char[][] argv)
> {
> if (argv.length < 3) {
> writefln("Wrong arguments");
> return 1;
> }
>
> const int READ_SIZE = 1024;
>
> FILE* fin = fopen(argv[1], "r");
> FILE* fout = fopen(argv[2], "w");
> char buffer[READ_SIZE];
> int[char[]] emails;
>
> PerformanceCounter counter = new PerformanceCounter();
> counter.start();
> while (!feof(fin)){
> fgets(cast(char*)buffer, READ_SIZE, fin);
> char[] email = toString(cast(char*)buffer);
> if (!(email in emails)){
> emails[toString(buffer)] = 0;
> fputs(cast(char*)email, fout);
> }
> }
>
> fclose(fout);
> fclose(fin);
> counter.stop();
>
> writefln(counter.milliseconds());
> return 0;
> }
> ======================================
>
> It used 1080 ms on my computer.
>
> A optimized c++ version:
> ======================================
> #include <iostream>
> #include <string>
> #include <fstream>
> #include <iterator>
> #include <sys/time.h>
> #include <vector>
> using namespace std;
>
>
> size_t my_hash (const char* str)
> {
> size_t ret = 0;
> while (*str)
> ret = 11 * ret + *str++;
> return ret;
> }
>
> class Email
> {
> private:
> size_t hash;
> const char* mail;
> friend bool operator < (const Email& lhs, const Email& rhs);
> public:
> Email (const char* mail_)
> : hash(my_hash(mail_)), mail(mail_)
> {
> }
>
> bool operator == (const Email& rhs)
> {
> if (hash == rhs.hash)
> return strcmp(mail, rhs.mail) == 0;
> return false;
> }
>
> const char* getEmail()const
> {
> return mail;
> }
> };
>
> bool operator < (const Email& lhs, const Email& rhs)
> {
> if (lhs.hash == rhs.hash)
> return strcmp(lhs.mail, rhs.mail) < 0;
> return lhs.hash < rhs.hash;
> }
>
> int main(int argc, char** argv)
> {
> if (argc < 3)
> {
> cout << "Wrong arguments" << endl;
> return 1;
> }
>
> FILE* fin = fopen(argv[1], "r");
> if (!fin)
> {
> cout << "Invalid input file" << endl;
> return 2;
> }
> FILE* fout = fopen(argv[2], "w");
> if (!fout)
> {
> fclose(fin);
> cout << "Invalid output file" << endl;
> return 3;
> }
>
> timeval start, end;
>
> const int BUF_SIZE = 20 * 1024 * 1024;
> char* buffer = new char[BUF_SIZE];
> memset(buffer, 0, BUF_SIZE);
>
> gettimeofday(&start, 0);
> vector<Email> emails;
>
> size_t read = fread (buffer, BUF_SIZE, 1, fin);
> char* begin = buffer;
> char* current = buffer;
>
> while (*current != '\0')
> {
> if (*current == '\n')
> {
> *current = '\0';
> emails.push_back(begin);
> begin = current + 1;
> }
> ++ current;
> }
> fclose(fin);
> sort(emails.begin(), emails.end());
> emails.erase (unique( emails.begin(), emails.end() ), emails.end());
>
> for (vector<Email>::const_iterator iter = emails.begin();
> iter != emails.end();
> iter ++)
> {
> fputs((*iter).getEmail(), fout);
> fwrite("\n", 1, 1, fout);
> }
>
> fclose(fout);
>
> gettimeofday(&end, 0);
>
> printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec
> - start.tv_usec) / 1000));
>
> delete[] buffer;
>
> return 0;
> }
> =====================================================
> It used 680 ms.
>
> I modified the D version to:
> ====================================================
> import std.stdio;
> import std.string;
> import std.perf;
>
> int main(char[][] argv)
> {
> if (argv.length < 3) {
> writefln("Wrong arguments");
> return 1;
> }
>
> const int BUF_SIZE = 20 * 1024 * 1024;
> char* buffer = new char[BUF_SIZE];
>
> PerformanceCounter counter = new PerformanceCounter();
> counter.start();
>
> FILE* fin = fopen(argv[1], "r");
> FILE* fout = fopen(argv[2], "w");
>
> fread(buffer, BUF_SIZE, 1, fin);
> fclose(fin);
>
> int[char[]] emails;
> char* begin = buffer;
> char* current = begin;
> char* newline = cast(char*)"\n";
>
> counter.stop();
> writefln(counter.milliseconds());
>
> while (*current != '\0')
> {
> if (*current == '\n')
> {
> *current = '\0';
> char[] email = toString(begin);
> if (!(email in emails))
> {
> emails[email] = 0;
> fputs (begin, fout);
> fwrite(newline, 1, 1, fout);
> }
> begin = current + 1;
> }
> ++ current;
> }
>
> fclose(fout);
> counter.stop();
>
> writefln(counter.milliseconds());
>
> delete buffer;
> return 0;
> }
> ====================================================
>
> But it used 3600 ms... :(
>
>
> I don't know how to accelerate it in D. :(
> Can you help me?
>
>
Two improvements based on your first D version:
0. Output in a separate loop.
1. Remove the "if(!(email in emails))" check.
Code:
while(!feof(fin)){
fgets(cast(char*)buffer, READ_SIZE, fin);
emails[toString(buffer)] = 0;
}
foreach(char[] k, int v; emails)
fputs(cast(char*)k, fout);
|
April 03, 2006 Re: How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
Posted in reply to Li Jie | From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a buffer for each readline. On my system with a file made of 440,000 lines of (just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the built-in AA's are pretty fast. Let us know your results (you'll have a lot more AA inserts and email dups). Note you have to dup the string when you insert into your AA. Try this (dmd v0.150): import std.stdio; import std.stream; import std.perf; int main(char[][] argv) { if (argv.length < 3) { writefln("Wrong arguments"); return 1; } char[8192] bufr; int[char[]] emails; char[] email; PerformanceCounter counter = new PerformanceCounter(); counter.start(); BufferedFile bsi = new BufferedFile(argv[1]); BufferedFile bso = new BufferedFile(argv[2],FileMode.Out); while(!bsi.eof) { email = bsi.readLine(bufr); // bufr is key to perf. if (!(email in emails)) { emails[email.dup] = 0; // Note .dup } } foreach(key; emails.keys.sort) { bso.write(cast(ubyte[])key); bso.write('\n'); } bso.close; bsi.close; counter.stop(); writefln(counter.milliseconds()); return 0; } In article <e0qqb3$3gt$1@digitaldaemon.com>, Li Jie says... > >At first, thank all my helpers. :) > > >I have some email addresses in a text file, about 780000 lines. There are some repeated lines, about 8%, and I want to remove them. > >I write a D version: >======================================= >import std.stdio; >import std.string; >import std.stdio; >import std.string; >import std.perf; > >int main(char[][] argv) >{ >if (argv.length < 3) { >writefln("Wrong arguments"); >return 1; >} > >const int READ_SIZE = 1024; > >FILE* fin = fopen(argv[1], "r"); >FILE* fout = fopen(argv[2], "w"); >char buffer[READ_SIZE]; >int[char[]] emails; > >PerformanceCounter counter = new PerformanceCounter(); >counter.start(); >while (!feof(fin)){ >fgets(cast(char*)buffer, READ_SIZE, fin); >char[] email = toString(cast(char*)buffer); >if (!(email in emails)){ >emails[toString(buffer)] = 0; >fputs(cast(char*)email, fout); >} >} > >fclose(fout); >fclose(fin); >counter.stop(); > >writefln(counter.milliseconds()); >return 0; >} >====================================== > >It used 1080 ms on my computer. > >A optimized c++ version: >====================================== >#include <iostream> >#include <string> >#include <fstream> >#include <iterator> >#include <sys/time.h> >#include <vector> >using namespace std; > > >size_t my_hash (const char* str) >{ >size_t ret = 0; >while (*str) >ret = 11 * ret + *str++; >return ret; >} > >class Email >{ >private: >size_t hash; >const char* mail; >friend bool operator < (const Email& lhs, const Email& rhs); >public: >Email (const char* mail_) >: hash(my_hash(mail_)), mail(mail_) >{ >} > >bool operator == (const Email& rhs) >{ >if (hash == rhs.hash) >return strcmp(mail, rhs.mail) == 0; >return false; >} > >const char* getEmail()const >{ >return mail; >} >}; > >bool operator < (const Email& lhs, const Email& rhs) >{ >if (lhs.hash == rhs.hash) >return strcmp(lhs.mail, rhs.mail) < 0; >return lhs.hash < rhs.hash; >} > >int main(int argc, char** argv) >{ >if (argc < 3) >{ >cout << "Wrong arguments" << endl; >return 1; >} > >FILE* fin = fopen(argv[1], "r"); >if (!fin) >{ >cout << "Invalid input file" << endl; >return 2; >} >FILE* fout = fopen(argv[2], "w"); >if (!fout) >{ >fclose(fin); >cout << "Invalid output file" << endl; >return 3; >} > >timeval start, end; > >const int BUF_SIZE = 20 * 1024 * 1024; >char* buffer = new char[BUF_SIZE]; >memset(buffer, 0, BUF_SIZE); > >gettimeofday(&start, 0); >vector<Email> emails; > >size_t read = fread (buffer, BUF_SIZE, 1, fin); >char* begin = buffer; >char* current = buffer; > >while (*current != '\0') >{ >if (*current == '\n') >{ >*current = '\0'; >emails.push_back(begin); >begin = current + 1; >} >++ current; >} >fclose(fin); >sort(emails.begin(), emails.end()); >emails.erase (unique( emails.begin(), emails.end() ), emails.end()); > >for (vector<Email>::const_iterator iter = emails.begin(); >iter != emails.end(); >iter ++) >{ >fputs((*iter).getEmail(), fout); >fwrite("\n", 1, 1, fout); >} > >fclose(fout); > >gettimeofday(&end, 0); > >printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec >- start.tv_usec) / 1000)); > >delete[] buffer; > >return 0; >} >===================================================== >It used 680 ms. > >I modified the D version to: >==================================================== >import std.stdio; >import std.string; >import std.perf; > >int main(char[][] argv) >{ >if (argv.length < 3) { >writefln("Wrong arguments"); >return 1; >} > >const int BUF_SIZE = 20 * 1024 * 1024; >char* buffer = new char[BUF_SIZE]; > >PerformanceCounter counter = new PerformanceCounter(); >counter.start(); > >FILE* fin = fopen(argv[1], "r"); >FILE* fout = fopen(argv[2], "w"); > >fread(buffer, BUF_SIZE, 1, fin); >fclose(fin); > >int[char[]] emails; >char* begin = buffer; >char* current = begin; >char* newline = cast(char*)"\n"; > >counter.stop(); >writefln(counter.milliseconds()); > >while (*current != '\0') >{ >if (*current == '\n') >{ >*current = '\0'; >char[] email = toString(begin); >if (!(email in emails)) >{ >emails[email] = 0; >fputs (begin, fout); >fwrite(newline, 1, 1, fout); >} >begin = current + 1; >} >++ current; >} > >fclose(fout); >counter.stop(); > >writefln(counter.milliseconds()); > >delete buffer; >return 0; >} >==================================================== > >But it used 3600 ms... :( > > >I don't know how to accelerate it in D. :( >Can you help me? > > |
April 03, 2006 Re: How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
Posted in reply to Wang Zhen | Wang Zhen wrote:
>
>
> Li Jie wrote:
>
>> At first, thank all my helpers. :)
>>
>>
>> I have some email addresses in a text file, about 780000 lines. There are some
>> repeated lines, about 8%, and I want to remove them.
>>
>> I write a D version:
<snip>
BTW, a simple shell command "sort emails.txt | uniq > output.txt" can get the job done in a few seconds. I assume you went all the trouble writing a D version just for the fun of it. No?
|
April 03, 2006 Re: How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
Posted in reply to Wang Zhen | Wang Zhen wrote:
<snip>
>
> BTW, a simple shell command "sort emails.txt | uniq > output.txt" can get the job done in a few seconds. I assume you went all the trouble writing a D version just for the fun of it. No?
or
sort -u emails.txt
|
April 04, 2006 Re: How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
Posted in reply to Wang Zhen | In article <e0rcej$10it$1@digitaldaemon.com>, Wang Zhen says... >Two improvements based on your first D version: >0. Output in a separate loop. >1. Remove the "if(!(email in emails))" check. > >Code: > >while(!feof(fin)){ > fgets(cast(char*)buffer, READ_SIZE, fin); > emails[toString(buffer)] = 0; >} >foreach(char[] k, int v; emails) > fputs(cast(char*)k, fout); Thanks. It takes 1080 ms on my system, it's not fast enough. I think "cast(char*)(char[])" and "toString" called too much, and it's very slowly. |
April 04, 2006 Re: How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
Posted in reply to Dave | In article <e0re95$12mg$1@digitaldaemon.com>, Dave says... > > >From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a buffer for each readline. On my system with a file made of 440,000 lines of (just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the built-in AA's are pretty fast. > >Let us know your results (you'll have a lot more AA inserts and email dups). Note you have to dup the string when you insert into your AA. > >Try this (dmd v0.150): Thanks Dave. But it takes ~3400 ms on my system, on windows and gentoo. |
April 04, 2006 Re: How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
Posted in reply to BCS | In article <e0s9oi$1uih$1@digitaldaemon.com>, BCS says... > >Wang Zhen wrote: > <snip> >> >> BTW, a simple shell command "sort emails.txt | uniq > output.txt" can get the job done in a few seconds. I assume you went all the trouble writing a D version just for the fun of it. No? > >or > >sort -u emails.txt Thanks. I want to compare the speed of C++, D and python. The best python version only takes 1700 ms, a "stand" c++ version (used set<string>) takes 5000 ms, an optimized c++ version takes 680 ms. I thinks an optimized D version can faster, shorter and easier to write than the c++ version. |
April 04, 2006 Re: How to accelerate this program? | ||||
---|---|---|---|---|
| ||||
Posted in reply to Li Jie | In article <e0sltd$2cl7$1@digitaldaemon.com>, Li Jie says... > >In article <e0re95$12mg$1@digitaldaemon.com>, Dave says... >> >> >>From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a buffer for each readline. On my system with a file made of 440,000 lines of (just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the built-in AA's are pretty fast. >> >>Let us know your results (you'll have a lot more AA inserts and email dups). Note you have to dup the string when you insert into your AA. >> >>Try this (dmd v0.150): > >Thanks Dave. > >But it takes ~3400 ms on my system, on windows and gentoo. > There's something screwy going on there... Even if I .dup every line of a 880000 line file with average size e-mails in it, I'm still getting 2x better performance than your C++ version, more in line with this: http://shootout.alioth.debian.org/gp4/benchmark.php?test=knucleotide&lang=all which has similiar operations as your test (I/O and hash table update). I'm using g++ v4.02 and -O3 -fomit-frame-pointer -funroll-loops -mtune=pentium4. Please send: - the output from 'dmd -v' - output from 'll /usr/lib/libphobos.a'. - what type of system is 'your system' - how large is the email file in bytes - which c++ compiler and flags are you using? - dmd compiler flags for your test. There are other ways to make the D code faster but a little less straight-forward. See the above benchmark site for those. Thanks. |
Copyright © 1999-2021 by the D Language Foundation