Jump to page: 1 2
Thread overview
How to accelerate this program?
Apr 03, 2006
Li Jie
Apr 03, 2006
John Demme
Apr 03, 2006
Wang Zhen
Apr 03, 2006
Wang Zhen
Apr 03, 2006
BCS
Apr 04, 2006
Li Jie
Apr 04, 2006
Li Jie
Apr 04, 2006
Dave
Apr 03, 2006
Dave
Apr 04, 2006
Li Jie
Apr 04, 2006
Dave
Apr 04, 2006
Li Jie
Apr 04, 2006
Dave
Apr 04, 2006
Lionello Lunesu
Apr 04, 2006
Dave
Apr 04, 2006
Georg Wrede
Apr 05, 2006
Li Jie
April 03, 2006
At first, thank all my helpers. :)


I have some email addresses in a text file, about 780000 lines. There are some repeated lines, about 8%, and I want to remove them.

I write a D version:
=======================================
import std.stdio;
import std.string;
import std.stdio;
import std.string;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3) {
writefln("Wrong arguments");
return 1;
}

const int READ_SIZE = 1024;

FILE* fin = fopen(argv[1], "r");
FILE* fout = fopen(argv[2], "w");
char buffer[READ_SIZE];
int[char[]] emails;

PerformanceCounter counter = new PerformanceCounter();
counter.start();
while (!feof(fin)){
fgets(cast(char*)buffer, READ_SIZE, fin);
char[] email = toString(cast(char*)buffer);
if (!(email in emails)){
emails[toString(buffer)] = 0;
fputs(cast(char*)email, fout);
}
}

fclose(fout);
fclose(fin);
counter.stop();

writefln(counter.milliseconds());
return 0;
}
======================================

It used 1080 ms on my computer.

A optimized c++ version:
======================================
#include <iostream>
#include <string>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <vector>
using namespace std;


size_t my_hash (const char* str)
{
size_t ret = 0;
while (*str)
ret = 11 * ret + *str++;
return ret;
}

class Email
{
private:
size_t hash;
const char* mail;
friend bool operator < (const Email& lhs, const Email& rhs);
public:
Email (const char* mail_)
: hash(my_hash(mail_)), mail(mail_)
{
}

bool operator == (const Email& rhs)
{
if (hash == rhs.hash)
return strcmp(mail, rhs.mail) == 0;
return false;
}

const char* getEmail()const
{
return mail;
}
};

bool operator < (const Email& lhs, const Email& rhs)
{
if (lhs.hash == rhs.hash)
return strcmp(lhs.mail, rhs.mail) < 0;
return lhs.hash < rhs.hash;
}

int main(int argc, char** argv)
{
if (argc < 3)
{
cout << "Wrong arguments" << endl;
return 1;
}

FILE* fin = fopen(argv[1], "r");
if (!fin)
{
cout << "Invalid input file" << endl;
return 2;
}
FILE* fout = fopen(argv[2], "w");
if (!fout)
{
fclose(fin);
cout << "Invalid output file" << endl;
return 3;
}

timeval start, end;

const int BUF_SIZE = 20 * 1024 * 1024;
char* buffer = new char[BUF_SIZE];
memset(buffer, 0, BUF_SIZE);

gettimeofday(&start, 0);
vector<Email> emails;

size_t read = fread (buffer, BUF_SIZE, 1, fin);
char* begin = buffer;
char* current = buffer;

while (*current != '\0')
{
if (*current == '\n')
{
*current = '\0';
emails.push_back(begin);
begin = current + 1;
}
++ current;
}
fclose(fin);
sort(emails.begin(), emails.end());
emails.erase (unique( emails.begin(), emails.end() ), emails.end());

for (vector<Email>::const_iterator iter = emails.begin();
iter != emails.end();
iter ++)
{
fputs((*iter).getEmail(), fout);
fwrite("\n", 1, 1, fout);
}

fclose(fout);

gettimeofday(&end, 0);

printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec
- start.tv_usec) / 1000));

delete[] buffer;

return 0;
}
=====================================================
It used 680 ms.

I modified the D version to:
====================================================
import std.stdio;
import std.string;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3) {
writefln("Wrong arguments");
return 1;
}

const int BUF_SIZE = 20 * 1024 * 1024;
char* buffer = new char[BUF_SIZE];

PerformanceCounter counter = new PerformanceCounter();
counter.start();

FILE* fin = fopen(argv[1], "r");
FILE* fout = fopen(argv[2], "w");

fread(buffer, BUF_SIZE, 1, fin);
fclose(fin);

int[char[]] emails;
char* begin = buffer;
char* current = begin;
char* newline = cast(char*)"\n";

counter.stop();
writefln(counter.milliseconds());

while (*current != '\0')
{
if (*current == '\n')
{
*current = '\0';
char[] email = toString(begin);
if (!(email in emails))
{
emails[email] = 0;
fputs (begin, fout);
fwrite(newline, 1, 1, fout);
}
begin = current + 1;
}
++ current;
}

fclose(fout);
counter.stop();

writefln(counter.milliseconds());

delete buffer;
return 0;
}
====================================================

But it used 3600 ms... :(


I don't know how to accelerate it in D. :(
Can you help me?


April 03, 2006
Li Jie wrote:

> At first, thank all my helpers. :)
> 
> 
> I have some email addresses in a text file, about 780000 lines. There are some repeated lines, about 8%, and I want to remove them.
> 
> I write a D version:
> =======================================
> import std.stdio;
> import std.string;
> import std.stdio;
> import std.string;
> import std.perf;
> 
> int main(char[][] argv)
> {
> if (argv.length < 3) {
> writefln("Wrong arguments");
> return 1;
> }
> 
> const int READ_SIZE = 1024;
> 
> FILE* fin = fopen(argv[1], "r");
> FILE* fout = fopen(argv[2], "w");
> char buffer[READ_SIZE];
> int[char[]] emails;
> 
> PerformanceCounter counter = new PerformanceCounter();
> counter.start();
> while (!feof(fin)){
> fgets(cast(char*)buffer, READ_SIZE, fin);
> char[] email = toString(cast(char*)buffer);
> if (!(email in emails)){
> emails[toString(buffer)] = 0;
> fputs(cast(char*)email, fout);
> }
> }
> 
> fclose(fout);
> fclose(fin);
> counter.stop();
> 
> writefln(counter.milliseconds());
> return 0;
> }
> ======================================
> 
> It used 1080 ms on my computer.
> 
> A optimized c++ version:
> ======================================
> #include <iostream>
> #include <string>
> #include <fstream>
> #include <iterator>
> #include <sys/time.h>
> #include <vector>
> using namespace std;
> 
> 
> size_t my_hash (const char* str)
> {
> size_t ret = 0;
> while (*str)
> ret = 11 * ret + *str++;
> return ret;
> }
> 
> class Email
> {
> private:
> size_t hash;
> const char* mail;
> friend bool operator < (const Email& lhs, const Email& rhs);
> public:
> Email (const char* mail_)
> : hash(my_hash(mail_)), mail(mail_)
> {
> }
> 
> bool operator == (const Email& rhs)
> {
> if (hash == rhs.hash)
> return strcmp(mail, rhs.mail) == 0;
> return false;
> }
> 
> const char* getEmail()const
> {
> return mail;
> }
> };
> 
> bool operator < (const Email& lhs, const Email& rhs)
> {
> if (lhs.hash == rhs.hash)
> return strcmp(lhs.mail, rhs.mail) < 0;
> return lhs.hash < rhs.hash;
> }
> 
> int main(int argc, char** argv)
> {
> if (argc < 3)
> {
> cout << "Wrong arguments" << endl;
> return 1;
> }
> 
> FILE* fin = fopen(argv[1], "r");
> if (!fin)
> {
> cout << "Invalid input file" << endl;
> return 2;
> }
> FILE* fout = fopen(argv[2], "w");
> if (!fout)
> {
> fclose(fin);
> cout << "Invalid output file" << endl;
> return 3;
> }
> 
> timeval start, end;
> 
> const int BUF_SIZE = 20 * 1024 * 1024;
> char* buffer = new char[BUF_SIZE];
> memset(buffer, 0, BUF_SIZE);
> 
> gettimeofday(&start, 0);
> vector<Email> emails;
> 
> size_t read = fread (buffer, BUF_SIZE, 1, fin);
> char* begin = buffer;
> char* current = buffer;
> 
> while (*current != '\0')
> {
> if (*current == '\n')
> {
> *current = '\0';
> emails.push_back(begin);
> begin = current + 1;
> }
> ++ current;
> }
> fclose(fin);
> sort(emails.begin(), emails.end());
> emails.erase (unique( emails.begin(), emails.end() ), emails.end());
> 
> for (vector<Email>::const_iterator iter = emails.begin();
> iter != emails.end();
> iter ++)
> {
> fputs((*iter).getEmail(), fout);
> fwrite("\n", 1, 1, fout);
> }
> 
> fclose(fout);
> 
> gettimeofday(&end, 0);
> 
> printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 +
> (end.tv_usec - start.tv_usec) / 1000));
> 
> delete[] buffer;
> 
> return 0;
> }
> =====================================================
> It used 680 ms.
> 
> I modified the D version to:
> ====================================================
> import std.stdio;
> import std.string;
> import std.perf;
> 
> int main(char[][] argv)
> {
> if (argv.length < 3) {
> writefln("Wrong arguments");
> return 1;
> }
> 
> const int BUF_SIZE = 20 * 1024 * 1024;
> char* buffer = new char[BUF_SIZE];
> 
> PerformanceCounter counter = new PerformanceCounter();
> counter.start();
> 
> FILE* fin = fopen(argv[1], "r");
> FILE* fout = fopen(argv[2], "w");
> 
> fread(buffer, BUF_SIZE, 1, fin);
> fclose(fin);
> 
> int[char[]] emails;
> char* begin = buffer;
> char* current = begin;
> char* newline = cast(char*)"\n";
> 
> counter.stop();
> writefln(counter.milliseconds());
> 
> while (*current != '\0')
> {
> if (*current == '\n')
> {
> *current = '\0';
> char[] email = toString(begin);
> if (!(email in emails))
> {
> emails[email] = 0;
> fputs (begin, fout);
> fwrite(newline, 1, 1, fout);
> }
> begin = current + 1;
> }
> ++ current;
> }
> 
> fclose(fout);
> counter.stop();
> 
> writefln(counter.milliseconds());
> 
> delete buffer;
> return 0;
> }
> ====================================================
> 
> But it used 3600 ms... :(
> 
> 
> I don't know how to accelerate it in D. :(
> Can you help me?


I would recommend using the Mango library and it's textual iterators.  See the example at: http://www.dsource.org/projects/mango/browser/trunk/example/lineio.d

A few tweaks to that example should get you on your way.

Also, when compiling, make sure to use the -O -inlinc -release flags.  -O optimizes, -inline allows smaller functions to be inlined, and -release removes things like bound checking.  I've found that using these flags improves performance significantly.

~John Demme
April 03, 2006

Li Jie wrote:
> At first, thank all my helpers. :)
> 
> 
> I have some email addresses in a text file, about 780000 lines. There are some
> repeated lines, about 8%, and I want to remove them.
> 
> I write a D version:
> =======================================
> import std.stdio;
> import std.string;
> import std.stdio;
> import std.string;
> import std.perf;
> 
> int main(char[][] argv)
> {
> if (argv.length < 3) {
> writefln("Wrong arguments");
> return 1;
> }
> 
> const int READ_SIZE = 1024;
> 
> FILE* fin = fopen(argv[1], "r");
> FILE* fout = fopen(argv[2], "w");
> char buffer[READ_SIZE];
> int[char[]] emails;
> 
> PerformanceCounter counter = new PerformanceCounter();
> counter.start();
> while (!feof(fin)){
> fgets(cast(char*)buffer, READ_SIZE, fin);
> char[] email = toString(cast(char*)buffer);
> if (!(email in emails)){
> emails[toString(buffer)] = 0;
> fputs(cast(char*)email, fout);
> }
> }
> 
> fclose(fout);
> fclose(fin);
> counter.stop();
> 
> writefln(counter.milliseconds());
> return 0;
> }
> ======================================
> 
> It used 1080 ms on my computer.
> 
> A optimized c++ version:
> ======================================
> #include <iostream>
> #include <string>
> #include <fstream>
> #include <iterator>
> #include <sys/time.h>
> #include <vector>
> using namespace std;
> 
> 
> size_t my_hash (const char* str)
> {
> size_t ret = 0;
> while (*str)
> ret = 11 * ret + *str++;
> return ret;
> }
> 
> class Email
> {
> private:
> size_t hash;
> const char* mail;
> friend bool operator < (const Email& lhs, const Email& rhs);
> public:
> Email (const char* mail_)
> : hash(my_hash(mail_)), mail(mail_)
> {
> }
> 
> bool operator == (const Email& rhs)
> {
> if (hash == rhs.hash)
> return strcmp(mail, rhs.mail) == 0;
> return false;
> }
> 
> const char* getEmail()const
> {
> return mail;
> }
> };
> 
> bool operator < (const Email& lhs, const Email& rhs)
> {
> if (lhs.hash == rhs.hash)
> return strcmp(lhs.mail, rhs.mail) < 0;
> return lhs.hash < rhs.hash;
> }
> 
> int main(int argc, char** argv)
> {
> if (argc < 3)
> {
> cout << "Wrong arguments" << endl;
> return 1;
> }
> 
> FILE* fin = fopen(argv[1], "r");
> if (!fin)
> {
> cout << "Invalid input file" << endl;
> return 2;
> }
> FILE* fout = fopen(argv[2], "w");
> if (!fout)
> {
> fclose(fin);
> cout << "Invalid output file" << endl;
> return 3;
> }
> 
> timeval start, end;
> 
> const int BUF_SIZE = 20 * 1024 * 1024;
> char* buffer = new char[BUF_SIZE];
> memset(buffer, 0, BUF_SIZE);
> 
> gettimeofday(&start, 0);
> vector<Email> emails;
> 
> size_t read = fread (buffer, BUF_SIZE, 1, fin);
> char* begin = buffer;
> char* current = buffer;
> 
> while (*current != '\0')
> {
> if (*current == '\n')
> {
> *current = '\0';
> emails.push_back(begin);
> begin = current + 1;
> }
> ++ current;
> }
> fclose(fin);
> sort(emails.begin(), emails.end());
> emails.erase (unique( emails.begin(), emails.end() ), emails.end());
> 
> for (vector<Email>::const_iterator iter = emails.begin();
> iter != emails.end();
> iter ++)
> {
> fputs((*iter).getEmail(), fout);
> fwrite("\n", 1, 1, fout);
> }
> 
> fclose(fout);
> 
> gettimeofday(&end, 0);
> 
> printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec
> - start.tv_usec) / 1000));
> 
> delete[] buffer;
> 
> return 0;
> }
> =====================================================
> It used 680 ms.
> 
> I modified the D version to:
> ====================================================
> import std.stdio;
> import std.string;
> import std.perf;
> 
> int main(char[][] argv)
> {
> if (argv.length < 3) {
> writefln("Wrong arguments");
> return 1;
> }
> 
> const int BUF_SIZE = 20 * 1024 * 1024;
> char* buffer = new char[BUF_SIZE];
> 
> PerformanceCounter counter = new PerformanceCounter();
> counter.start();
> 
> FILE* fin = fopen(argv[1], "r");
> FILE* fout = fopen(argv[2], "w");
> 
> fread(buffer, BUF_SIZE, 1, fin);
> fclose(fin);
> 
> int[char[]] emails;
> char* begin = buffer;
> char* current = begin;
> char* newline = cast(char*)"\n";
> 
> counter.stop();
> writefln(counter.milliseconds());
> 
> while (*current != '\0')
> {
> if (*current == '\n')
> {
> *current = '\0';
> char[] email = toString(begin);
> if (!(email in emails))
> {
> emails[email] = 0;
> fputs (begin, fout);
> fwrite(newline, 1, 1, fout);
> }
> begin = current + 1;
> }
> ++ current;
> }
> 
> fclose(fout);
> counter.stop();
> 
> writefln(counter.milliseconds());
> 
> delete buffer;
> return 0;
> }
> ====================================================
> 
> But it used 3600 ms... :(
> 
> 
> I don't know how to accelerate it in D. :(
> Can you help me?
> 
> 

Two improvements based on your first D version:
0. Output in a separate loop.
1. Remove the "if(!(email in emails))" check.

Code:

while(!feof(fin)){
    fgets(cast(char*)buffer, READ_SIZE, fin);
    emails[toString(buffer)] = 0;
}
foreach(char[] k, int v; emails)
    fputs(cast(char*)k, fout);
April 03, 2006
From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a buffer for each readline. On my system with a file made of 440,000 lines of (just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the built-in AA's are pretty fast.

Let us know your results (you'll have a lot more AA inserts and email dups). Note you have to dup the string when you insert into your AA.

Try this (dmd v0.150):

import std.stdio;
import std.stream;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3)
{
writefln("Wrong arguments");
return 1;
}

char[8192] bufr;
int[char[]] emails;
char[] email;

PerformanceCounter counter = new PerformanceCounter();
counter.start();

BufferedFile bsi = new BufferedFile(argv[1]);
BufferedFile bso = new BufferedFile(argv[2],FileMode.Out);
while(!bsi.eof)
{
email = bsi.readLine(bufr); // bufr is key to perf.
if (!(email in emails))
{
emails[email.dup] = 0; // Note .dup
}
}

foreach(key; emails.keys.sort)
{
bso.write(cast(ubyte[])key);
bso.write('\n');
}
bso.close;
bsi.close;

counter.stop();
writefln(counter.milliseconds());

return 0;
}

In article <e0qqb3$3gt$1@digitaldaemon.com>, Li Jie says...
>
>At first, thank all my helpers. :)
>
>
>I have some email addresses in a text file, about 780000 lines. There are some repeated lines, about 8%, and I want to remove them.
>
>I write a D version:
>=======================================
>import std.stdio;
>import std.string;
>import std.stdio;
>import std.string;
>import std.perf;
>
>int main(char[][] argv)
>{
>if (argv.length < 3) {
>writefln("Wrong arguments");
>return 1;
>}
>
>const int READ_SIZE = 1024;
>
>FILE* fin = fopen(argv[1], "r");
>FILE* fout = fopen(argv[2], "w");
>char buffer[READ_SIZE];
>int[char[]] emails;
>
>PerformanceCounter counter = new PerformanceCounter();
>counter.start();
>while (!feof(fin)){
>fgets(cast(char*)buffer, READ_SIZE, fin);
>char[] email = toString(cast(char*)buffer);
>if (!(email in emails)){
>emails[toString(buffer)] = 0;
>fputs(cast(char*)email, fout);
>}
>}
>
>fclose(fout);
>fclose(fin);
>counter.stop();
>
>writefln(counter.milliseconds());
>return 0;
>}
>======================================
>
>It used 1080 ms on my computer.
>
>A optimized c++ version:
>======================================
>#include <iostream>
>#include <string>
>#include <fstream>
>#include <iterator>
>#include <sys/time.h>
>#include <vector>
>using namespace std;
>
>
>size_t my_hash (const char* str)
>{
>size_t ret = 0;
>while (*str)
>ret = 11 * ret + *str++;
>return ret;
>}
>
>class Email
>{
>private:
>size_t hash;
>const char* mail;
>friend bool operator < (const Email& lhs, const Email& rhs);
>public:
>Email (const char* mail_)
>: hash(my_hash(mail_)), mail(mail_)
>{
>}
>
>bool operator == (const Email& rhs)
>{
>if (hash == rhs.hash)
>return strcmp(mail, rhs.mail) == 0;
>return false;
>}
>
>const char* getEmail()const
>{
>return mail;
>}
>};
>
>bool operator < (const Email& lhs, const Email& rhs)
>{
>if (lhs.hash == rhs.hash)
>return strcmp(lhs.mail, rhs.mail) < 0;
>return lhs.hash < rhs.hash;
>}
>
>int main(int argc, char** argv)
>{
>if (argc < 3)
>{
>cout << "Wrong arguments" << endl;
>return 1;
>}
>
>FILE* fin = fopen(argv[1], "r");
>if (!fin)
>{
>cout << "Invalid input file" << endl;
>return 2;
>}
>FILE* fout = fopen(argv[2], "w");
>if (!fout)
>{
>fclose(fin);
>cout << "Invalid output file" << endl;
>return 3;
>}
>
>timeval start, end;
>
>const int BUF_SIZE = 20 * 1024 * 1024;
>char* buffer = new char[BUF_SIZE];
>memset(buffer, 0, BUF_SIZE);
>
>gettimeofday(&start, 0);
>vector<Email> emails;
>
>size_t read = fread (buffer, BUF_SIZE, 1, fin);
>char* begin = buffer;
>char* current = buffer;
>
>while (*current != '\0')
>{
>if (*current == '\n')
>{
>*current = '\0';
>emails.push_back(begin);
>begin = current + 1;
>}
>++ current;
>}
>fclose(fin);
>sort(emails.begin(), emails.end());
>emails.erase (unique( emails.begin(), emails.end() ), emails.end());
>
>for (vector<Email>::const_iterator iter = emails.begin();
>iter != emails.end();
>iter ++)
>{
>fputs((*iter).getEmail(), fout);
>fwrite("\n", 1, 1, fout);
>}
>
>fclose(fout);
>
>gettimeofday(&end, 0);
>
>printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec
>- start.tv_usec) / 1000));
>
>delete[] buffer;
>
>return 0;
>}
>=====================================================
>It used 680 ms.
>
>I modified the D version to:
>====================================================
>import std.stdio;
>import std.string;
>import std.perf;
>
>int main(char[][] argv)
>{
>if (argv.length < 3) {
>writefln("Wrong arguments");
>return 1;
>}
>
>const int BUF_SIZE = 20 * 1024 * 1024;
>char* buffer = new char[BUF_SIZE];
>
>PerformanceCounter counter = new PerformanceCounter();
>counter.start();
>
>FILE* fin = fopen(argv[1], "r");
>FILE* fout = fopen(argv[2], "w");
>
>fread(buffer, BUF_SIZE, 1, fin);
>fclose(fin);
>
>int[char[]] emails;
>char* begin = buffer;
>char* current = begin;
>char* newline = cast(char*)"\n";
>
>counter.stop();
>writefln(counter.milliseconds());
>
>while (*current != '\0')
>{
>if (*current == '\n')
>{
>*current = '\0';
>char[] email = toString(begin);
>if (!(email in emails))
>{
>emails[email] = 0;
>fputs (begin, fout);
>fwrite(newline, 1, 1, fout);
>}
>begin = current + 1;
>}
>++ current;
>}
>
>fclose(fout);
>counter.stop();
>
>writefln(counter.milliseconds());
>
>delete buffer;
>return 0;
>}
>====================================================
>
>But it used 3600 ms... :(
>
>
>I don't know how to accelerate it in D. :(
>Can you help me?
>
>


April 03, 2006
Wang Zhen wrote:
> 
> 
> Li Jie wrote:
> 
>> At first, thank all my helpers. :)
>>
>>
>> I have some email addresses in a text file, about 780000 lines. There are some
>> repeated lines, about 8%, and I want to remove them.
>>
>> I write a D version:
<snip>

BTW, a simple shell command "sort emails.txt | uniq > output.txt" can get the job done in a few seconds. I assume you went all the trouble writing a D version just for the fun of it. No?
April 03, 2006
Wang Zhen wrote:
 <snip>
> 
> BTW, a simple shell command "sort emails.txt | uniq > output.txt" can get the job done in a few seconds. I assume you went all the trouble writing a D version just for the fun of it. No?

or

sort -u emails.txt
April 04, 2006
In article <e0rcej$10it$1@digitaldaemon.com>, Wang Zhen says...

>Two improvements based on your first D version:
>0. Output in a separate loop.
>1. Remove the "if(!(email in emails))" check.
>
>Code:
>
>while(!feof(fin)){
>     fgets(cast(char*)buffer, READ_SIZE, fin);
>     emails[toString(buffer)] = 0;
>}
>foreach(char[] k, int v; emails)
>     fputs(cast(char*)k, fout);

Thanks.

It takes 1080 ms on my system, it's not fast enough.
I think "cast(char*)(char[])" and "toString" called too much, and it's very
slowly.


April 04, 2006
In article <e0re95$12mg$1@digitaldaemon.com>, Dave says...
>
>
>From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a buffer for each readline. On my system with a file made of 440,000 lines of (just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the built-in AA's are pretty fast.
>
>Let us know your results (you'll have a lot more AA inserts and email dups). Note you have to dup the string when you insert into your AA.
>
>Try this (dmd v0.150):

Thanks Dave.

But it takes ~3400 ms on my system, on windows and gentoo.


April 04, 2006
In article <e0s9oi$1uih$1@digitaldaemon.com>, BCS says...
>
>Wang Zhen wrote:
>  <snip>
>> 
>> BTW, a simple shell command "sort emails.txt | uniq > output.txt" can get the job done in a few seconds. I assume you went all the trouble writing a D version just for the fun of it. No?
>
>or
>
>sort -u emails.txt

Thanks.

I want to compare the speed of C++, D and python.

The best python version only takes 1700 ms, a "stand" c++ version (used set<string>) takes 5000 ms, an optimized c++ version takes 680 ms.  I thinks an optimized D version can faster, shorter and easier to write than the c++ version.


April 04, 2006
In article <e0sltd$2cl7$1@digitaldaemon.com>, Li Jie says...
>
>In article <e0re95$12mg$1@digitaldaemon.com>, Dave says...
>>
>>
>>From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a buffer for each readline. On my system with a file made of 440,000 lines of (just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the built-in AA's are pretty fast.
>>
>>Let us know your results (you'll have a lot more AA inserts and email dups). Note you have to dup the string when you insert into your AA.
>>
>>Try this (dmd v0.150):
>
>Thanks Dave.
>
>But it takes ~3400 ms on my system, on windows and gentoo.
>

There's something screwy going on there... Even if I .dup every line of a 880000 line file with average size e-mails in it, I'm still getting 2x better performance than your C++ version, more in line with this:

http://shootout.alioth.debian.org/gp4/benchmark.php?test=knucleotide&lang=all

which has similiar operations as your test (I/O and hash table update).

I'm using g++ v4.02 and -O3 -fomit-frame-pointer -funroll-loops -mtune=pentium4.

Please send:

- the output from 'dmd -v'
- output from 'll /usr/lib/libphobos.a'.
- what type of system is 'your system'
- how large is the email file in bytes
- which c++ compiler and flags are you using?
- dmd compiler flags for your test.

There are other ways to make the D code faster but a little less straight-forward. See the above benchmark site for those.

Thanks.


« First   ‹ Prev
1 2