Thread overview
A third example use of C macros that has not been discussed
Feb 01, 2003
Paul Sheer
Feb 01, 2003
Andy Friesen
Feb 02, 2003
Mike Wynn
February 01, 2003
There are many situations where you want to reimplement
a piece of code several times, with a different type
for each. This can happen for optimization, or for
marshalling functions. This is the strongest reason
for support of a macro preprocessor.

This example shows an optimized memcpy function that
copies whole words at a time, properly accounting for
possible alignment differences on processors that
do not support non-aligned word stores.

It is elegantly done with macros, of course.

-paul

--------

/*
 * cpy_t must be set to the most efficient copying type - usually
 * unsigned long.  memcpy() below starts from sizeof (cpy_t) as the widest
 * alignment it tests and halves it until source and destination agree.
 */
typedef unsigned long cpy_t;

/*
 * word_copy(t, d, s, count): copy `count` bytes from `s` to `d`, moving
 * whole objects of type `t` wherever possible.
 *
 *   t      type used for the bulk transfer; sizeof (t) must be a power of
 *          two, because the alignment test masks with sizeof (t) - 1.
 *   d, s   destination and source pointers, each evaluated exactly once.
 *          Only the destination's alignment is examined, so the caller
 *          must ensure both addresses are equal modulo sizeof (t).
 *   count  modifiable lvalue holding the number of bytes to copy.  It is
 *          decremented as the copy proceeds; do not rely on its value
 *          afterwards.
 *
 * Every local carries a wc_ prefix so that caller expressions such as
 * `dst`, `src` or `c` cannot be captured by the macro's own declarations.
 */
#define word_copy(t,d,s,count) \
    do { \
	char *wc_d8 = (char *) (d); \
	const char *wc_s8 = (const char *) (s); \
	t wc_a0, wc_a1, *wc_dst; \
	const t *wc_src; \
	unsigned int wc_mis = \
	    (unsigned int) ((unsigned long) wc_d8 & (sizeof (t) - 1)); \
	/* Head: single bytes until the destination is t-aligned. */ \
	while ((count) && (wc_mis & (sizeof (t) - 1))) { \
	    *wc_d8++ = *wc_s8++; \
	    (count)--; \
	    wc_mis++; \
	} \
	wc_dst = (t *) wc_d8; \
	wc_src = (const t *) wc_s8; \
	/* Body: two words per pass, both loads before both stores. */ \
	while ((count) >= sizeof (t) * 2) { \
	    wc_a0 = wc_src[0]; \
	    wc_a1 = wc_src[1]; \
	    (count) -= sizeof (t) * 2; \
	    wc_dst[0] = wc_a0; \
	    wc_dst[1] = wc_a1; \
	    wc_src += 2; \
	    wc_dst += 2; \
	} \
	/* At most one whole word can remain after the paired loop. */ \
	while ((count) >= sizeof (t)) { \
	    *wc_dst++ = *wc_src++; \
	    (count) -= sizeof (t); \
	} \
	/* Tail: the leftover bytes, fewer than sizeof (t). */ \
	wc_d8 = (char *) wc_dst; \
	wc_s8 = (const char *) wc_src; \
	while ((count)--) \
	    *wc_d8++ = *wc_s8++; \
    } while (0)

/*
 * byte_copy(t, d, s, count): copy `count` bytes from `s` to `d`, one byte
 * at a time.
 *
 *   t      not used by the expansion (the parameter list parallels
 *          word_copy).
 *   d, s   destination and source pointers, each evaluated exactly once.
 *   count  modifiable lvalue holding the number of bytes to copy.  It is
 *          decremented as the copy proceeds; do not rely on its value
 *          afterwards.
 *
 * Locals carry a bc_ prefix so caller expressions named `d8` or `s8`
 * cannot be captured by the macro's own declarations.
 */
#define byte_copy(t,d,s,count) \
    do { \
	char *bc_d8 = (char *) (d); \
	const char *bc_s8 = (const char *) (s); \
	while ((count)--) \
	    *bc_d8++ = *bc_s8++; \
    } while (0)

/*
 * memcpy - copy count bytes from _src to _dest and return _dest.
 *
 * The widest word size at which both addresses share the same alignment
 * is found first: f starts at sizeof (cpy_t) and is halved until the two
 * addresses agree in their low bits.  The copy is then dispatched to
 * word_copy with the matching fixed-width type, or to byte_copy when the
 * addresses only agree at byte granularity.
 *
 * The copy runs forward from the lowest address; overlapping regions get
 * no special handling.
 */
void *memcpy (void *_dest, const void *_src, size_t count)
{
    /* Bits in which the two addresses differ; computed once because
       neither pointer changes while f is being narrowed. */
    unsigned long diff = (unsigned long) _src ^ (unsigned long) _dest;
    unsigned int f = sizeof (cpy_t);

    /* Terminates at f == 1 at the latest, where the mask is zero. */
    while (diff & (f - 1))
	f >>= 1;

    /* Range tests rather than exact cases, so a cpy_t wider than 8 bytes
       still copies (with 64-bit words) instead of matching no branch. */
    if (f >= 8) {
	word_copy (u_int64_t, _dest, _src, count);
    } else if (f >= 4) {
	word_copy (u_int32_t, _dest, _src, count);
    } else if (f >= 2) {
	word_copy (u_int16_t, _dest, _src, count);
    } else {
	byte_copy (u_int8_t, _dest, _src, count);
    }
    return _dest;
}


February 01, 2003
Maybe I'm missing something, but it seems to me that you could do that with templates pretty easily.  In so doing, you get a bit of type safety, and you avoid arguments being evaluated more than once.

    /**
     * Word-at-a-time copy, parameterised on the word type T.
     *
     * word_copy copies `count` bytes from `src` to `dest`: single bytes
     * until `dest` is T-aligned, then whole T values (two per pass), then
     * the remaining bytes.  Only the destination's alignment is checked,
     * so both pointers must have the same address modulo T.sizeof.
     */
    template CopyLoop(T)
    {
        void word_copy(T* dest, T* src, int count)
        {
            char* d8 = cast(char*) dest;
            char* s8 = cast(char*) src;

            // Head: single bytes until the destination is T-aligned.
            uint c = cast(uint) (cast(size_t) dest & (T.sizeof - 1));
            while (count && (c & (T.sizeof - 1)))
            {
                *d8++ = *s8++;
                count--;
                c++;
            }

            // Distinct names: the working pointers must not shadow the
            // `src` parameter.
            T* wdst = cast(T*) d8;
            T* wsrc = cast(T*) s8;

            // Body: two words per pass, both loads before both stores.
            while (count >= T.sizeof * 2)
            {
                T a0 = wsrc[0];
                T a1 = wsrc[1];
                count -= cast(int) (T.sizeof * 2);
                wdst[0] = a0;
                wdst[1] = a1;
                wsrc += 2;
                wdst += 2;
            }
            while (count >= T.sizeof)
            {
                *wdst++ = *wsrc++;
                count -= cast(int) T.sizeof;
            }

            // Tail: the leftover bytes, fewer than T.sizeof.
            d8 = cast(char*) wdst;
            s8 = cast(char*) wsrc;
            while (count--)
                *d8++ = *s8++;
        }
    }


Paul Sheer wrote:
> There are many situations where you want to reimplement
> a piece of code several times, with a different type
> for each. This can happen for optimization, or for
> marshalling functions. This is the strongest reason
> for support of a macro preprocessor.
> 
> This example shows an optimized memcpy function that
> copies in words at a time, properly accounting for
> possible alignment differences on processors that
> do not support non-aligned word stores.
> 
> It is eligantly done with macros of course.
> 
> -paul
> 
> --------
> 
> /* this must be set to the most efficient copying type - usually
> unsigned long: */
> typedef unsigned long cpy_t;
> 
> #define word_copy(t,d,s,count)					\
>     do {							\
> 	unsigned int c;						\
> 	char *d8 = (char *) (d);				\
> 	char *s8 = (char *) (s);				\
> 	register t a0, a1, *dst, *src;				\
> 	c = (unsigned long) (d) & (sizeof (t) - 1);		\
> 	while (count && (c & (sizeof (t) - 1)))			\
> 	     (*d8++ = *s8++), count--, c++;			\
> 	dst = (t *) d8;						\
> 	src = (t *) s8;						\
> 	while (count >= (sizeof (t)) * 2) {			\
> 	    a0 = src[0];					\
> 	    a1 = src[1];					\
> 	    count -= (sizeof (t)) * 2;				\
> 	    dst[0] = a0;					\
> 	    dst[1] = a1;					\
> 	    src += 2;						\
> 	    dst += 2;						\
> 	}							\
> 	while (count >= (sizeof (t))) {				\
> 	    *dst++ = *src++;					\
> 	    count -= sizeof (t);				\
> 	}							\
> 	d8 = (char *) dst;					\
> 	s8 = (char *) src;					\
> 	while (count--)						\
> 	    *d8++ = *s8++;					\
>     } while (0)
> 
> #define byte_copy(t,d,s,count)					\
>     do {							\
> 	char *d8 = (char *) (d);				\
> 	char *s8 = (char *) (s);				\
> 	while (count--)						\
> 	    *d8++ = *s8++;					\
>     } while (0)
> 
> void *memcpy (void *_dest, const void *_src, size_t count)
> {
>     unsigned int f;
> /* check alignment */
>     f = sizeof (cpy_t);
>     while ((((unsigned long) _src) & (f - 1)) !=                 (((unsigned long) _dest & (f - 1))))
> 	f >>= 1;
>     switch (f) {
>     case 8:
> 	word_copy (u_int64_t, _dest, _src, count);
> 	break;
>     case 4:
> 	word_copy (u_int32_t, _dest, _src, count);
> 	break;
>     case 2:
> 	word_copy (u_int16_t, _dest, _src, count);
> 	break;
>     case 1:
> 	byte_copy (u_int8_t, _dest, _src, count);
> 	break;
>     }
>     return (void *) _dest;
> }
> 
> 

February 02, 2003
IMHO:
The compiler should generate an optimised memcpy from the source:

memcpy( foo, bar, len );

(inlined if optimised for speed; either way, the fastest for the platform and the CPU's supported instruction set).

Also, you've not put a Duff's device in there (they're legal in D); tight
loops kill performance.
And on some architectures unaligned int reads are allowed and less expensive
than 4 byte reads. I'm sure they are less expensive than 4 byte reads and 4 branches.

Mike.


"Paul Sheer" <psheer@icon.co.za> wrote in message news:b1gs7d$2i9c$1@digitaldaemon.com...
>
> There are many situations where you want to reimplement
> a piece of code several times, with a different type
> for each. This can happen for optimization, or for
> marshalling functions. This is the strongest reason
> for support of a macro preprocessor.
>
> This example shows an optimized memcpy function that
> copies in words at a time, properly accounting for
> possible alignment differences on processors that
> do not support non-aligned word stores.
>
> It is eligantly done with macros of course.
>
> -paul
>
> --------
>
> /* this must be set to the most efficient copying type - usually
> unsigned long: */
> typedef unsigned long cpy_t;
>
> #define word_copy(t,d,s,count) \
>     do { \
> unsigned int c; \
> char *d8 = (char *) (d); \
> char *s8 = (char *) (s); \
> register t a0, a1, *dst, *src; \
> c = (unsigned long) (d) & (sizeof (t) - 1); \
> while (count && (c & (sizeof (t) - 1))) \
>      (*d8++ = *s8++), count--, c++; \
> dst = (t *) d8; \
> src = (t *) s8; \
> while (count >= (sizeof (t)) * 2) { \
>     a0 = src[0]; \
>     a1 = src[1]; \
>     count -= (sizeof (t)) * 2; \
>     dst[0] = a0; \
>     dst[1] = a1; \
>     src += 2; \
>     dst += 2; \
> } \
> while (count >= (sizeof (t))) { \
>     *dst++ = *src++; \
>     count -= sizeof (t); \
> } \
> d8 = (char *) dst; \
> s8 = (char *) src; \
> while (count--) \
>     *d8++ = *s8++; \
>     } while (0)
>
> #define byte_copy(t,d,s,count) \
>     do { \
> char *d8 = (char *) (d); \
> char *s8 = (char *) (s); \
> while (count--) \
>     *d8++ = *s8++; \
>     } while (0)
>
> void *memcpy (void *_dest, const void *_src, size_t count)
> {
>     unsigned int f;
> /* check alignment */
>     f = sizeof (cpy_t);
>     while ((((unsigned long) _src) & (f - 1)) !=
>                 (((unsigned long) _dest & (f - 1))))
> f >>= 1;
>     switch (f) {
>     case 8:
> word_copy (u_int64_t, _dest, _src, count);
> break;
>     case 4:
> word_copy (u_int32_t, _dest, _src, count);
> break;
>     case 2:
> word_copy (u_int16_t, _dest, _src, count);
> break;
>     case 1:
> byte_copy (u_int8_t, _dest, _src, count);
> break;
>     }
>     return (void *) _dest;
> }
>
>