Jump to page: 1 2
Thread overview
[Issue 5221] New: entity.c: Merge Walter's list with Thomas'
Nov 16, 2010
Iain Buclaw
Nov 16, 2010
Iain Buclaw
Nov 16, 2010
Iain Buclaw
Nov 26, 2010
Iain Buclaw
Jan 28, 2011
Aziz Köksal
Jan 28, 2011
Don
Jan 29, 2011
Iain Buclaw
Jan 29, 2011
Iain Buclaw
Jan 29, 2011
Iain Buclaw
Jan 29, 2011
Iain Buclaw
Jan 29, 2011
Iain Buclaw
Jan 29, 2011
Aziz Köksal
Jan 29, 2011
Iain Buclaw
Jan 30, 2011
Iain Buclaw
Jan 30, 2011
Iain Buclaw
Jan 31, 2011
Don
Jan 31, 2011
Iain Buclaw
Jan 31, 2011
Don
Feb 04, 2011
Aziz Köksal
Feb 06, 2011
Don
November 16, 2010
http://d.puremagic.com/issues/show_bug.cgi?id=5221

           Summary: entity.c: Merge Walter's list with Thomas'
           Product: D
           Version: D1 & D2
          Platform: All
        OS/Version: All
            Status: NEW
          Severity: normal
          Priority: P2
         Component: DMD
        AssignedTo: nobody@puremagic.com
        ReportedBy: ibuclaw@ubuntu.com


--- Comment #0 from Iain Buclaw <ibuclaw@ubuntu.com> 2010-11-16 05:35:25 PST ---
Created an attachment (id=815)
Merge Walter's list with Thomas'

Thought it might prove useful to DMD to merge the lists, and remove that "dumb, slow linear search" of theirs in entity.c

Regards

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
November 16, 2010
http://d.puremagic.com/issues/show_bug.cgi?id=5221



--- Comment #1 from Iain Buclaw <ibuclaw@ubuntu.com> 2010-11-16 05:45:59 PST ---
Random examples of tests that fail on DMD:

static assert('\&check;'==10003);
static assert('\&lsim;'==8818);
static assert('\&numero;'==8470);
static assert('\&urcorn;'==8989);
static assert('\&Zdot;'==379);


Regards

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
November 16, 2010
http://d.puremagic.com/issues/show_bug.cgi?id=5221



--- Comment #2 from Iain Buclaw <ibuclaw@ubuntu.com> 2010-11-16 06:07:53 PST ---
(From update of attachment 815)
diff -ur src.orig/entity.c src/entity.c
--- src.orig/entity.c    2010-03-31 01:26:18.000000000 +0100
+++ src/entity.c    2010-11-16 14:01:58.423055202 +0000
@@ -9,6 +9,7 @@


 #include <string.h>
+#include <ctype.h>

 /*********************************************
  * Convert from named entity to its encoding.
@@ -23,7 +24,6 @@
     unsigned short value;
 };

-#if IN_GCC
 static NameId namesA[]={
         "Aacgr",        0x0386,
         "aacgr",        0x03AC,
@@ -42,7 +42,9 @@
         "agr",          0x03B1,
         "Agrave",       0x00C0,
         "agrave",       0x00E0,
+        "alefsym",      0x2135,
         "aleph",        0x2135,
+        "Alpha",        0x0391,
         "alpha",        0x03B1,
         "Amacr",        0x0100,
         "amacr",        0x0101,
@@ -76,9 +78,11 @@
         "bcong",        0x224C,
         "Bcy",          0x0411,
         "bcy",          0x0431,
+        "bdquo",        0x201E,
         "becaus",       0x2235,
         "bepsi",        0x220D,
         "bernou",       0x212C,
+        "Beta",         0x0392,
         "beta",         0x03B2,
         "beth",         0x2136,
         "Bgr",          0x0392,
@@ -162,6 +166,7 @@
         "CHcy",         0x0427,
         "chcy",         0x0447,
         "check",        0x2713,
+        "Chi",          0x03A7,
         "chi",          0x03C7,
         "cir",          0x25CB,
         "circ",         0x005E,
@@ -178,6 +183,7 @@
         "coprod",       0x2210,
         "copy",         0x00A9,
         "copysr",       0x2117,
+        "crarr",        0x21B5,
         "cross",        0x2717,
         "cuepr",        0x22DE,
         "cuesc",        0x22DF,
@@ -281,17 +287,21 @@
         "Eogon",        0x0118,
         "eogon",        0x0119,
         "epsi",         0x220A,
+        "Epsilon",      0x0395,
+        "epsilon",      0x03B5,
         "epsis",        0x220A,
         "epsiv",        0x03B5,
         "equals",       0x003D,
         "equiv",        0x2261,
         "erDot",        0x2253,
         "esdot",        0x2250,
+        "Eta",          0x0397,
         "eta",          0x03B7,
         "ETH",          0x00D0,
         "eth",          0x00F0,
         "Euml",         0x00CB,
         "euml",         0x00EB,
+        "euro",         0x20AC,
         "excl",         0x0021,
         "exist",        0x2203,
         NULL,           0
@@ -325,6 +335,7 @@
         "frac56",       0x215A,
         "frac58",       0x215D,
         "frac78",       0x215E,
+        "frasl",        0x2044,
         "frown",        0x2322,
         NULL,           0
 };
@@ -425,6 +436,7 @@
         "iocy",         0x0451,
         "Iogon",        0x012E,
         "iogon",        0x012F,
+        "Iota",         0x0399,
         "iota",         0x03B9,
         "iquest",       0x00BF,
         "isin",         0x220A,
@@ -450,6 +462,7 @@
 };

 static NameId namesK[]={
+        "Kappa",        0x039A,
         "kappa",        0x03BA,
         "kappav",       0x03F0,
         "Kcedil",       0x0136,
@@ -523,7 +536,9 @@
         "lozf",         0x2726,
         "lpar",         0x0028,
         "lrarr2",       0x21C6,
+        "lrm",          0x200E,
         "lrhar2",       0x21CB,
+        "lsaquo",       0x2039,
         "lsh",          0x21B0,
         "lsim",         0x2272,
         "lsqb",         0x005B,
@@ -561,6 +576,7 @@
         "mldr",         0x2026,
         "mnplus",       0x2213,
         "models",       0x22A7,
+        "Mu",           0x039C,
         "mu",           0x03BC,
         "mumap",        0x22B8,
         NULL,           0
@@ -573,8 +589,7 @@
         "nap",          0x2249,
         "napos",        0x0149,
         "natur",        0x266E,
-//      "nbsp",         0x00A0,
-        "nbsp",         32,    // make non-breaking space appear as space
+        "nbsp",         0x00A0,
         "Ncaron",       0x0147,
         "ncaron",       0x0148,
         "Ncedil",       0x0145,
@@ -631,6 +646,7 @@
         "nsupE",        0x2289,
         "Ntilde",       0x00D1,
         "ntilde",       0x00F1,
+        "Nu",           0x039D,
         "nu",           0x03BD,
         "num",          0x0023,
         "numero",       0x2116,
@@ -671,10 +687,13 @@
         "ohgr",         0x03C9,
         "ohm",          0x2126,
         "olarr",        0x21BA,
+        "oline",        0x203E,
         "Omacr",        0x014C,
         "omacr",        0x014D,
         "Omega",        0x03A9,
         "omega",        0x03C9,
+        "Omicron",      0x039F,
+        "omicron",      0x03BF,
         "ominus",       0x2296,
         "oplus",        0x2295,
         "or",           0x2228,
@@ -709,6 +728,7 @@
         "PHgr",         0x03A6,
         "phgr",         0x03C6,
         "Phi",          0x03A6,
+        "phi",          0x03C6,
         "phis",         0x03C6,
         "phiv",         0x03D5,
         "phmmat",       0x2133,
@@ -780,13 +800,16 @@
         "rgr",          0x03C1,
         "rhard",        0x21C1,
         "rharu",        0x21C0,
+        "Rho",          0x03A1,
         "rho",          0x03C1,
         "rhov",         0x03F1,
         "ring",         0x02DA,
         "rlarr2",       0x21C4,
         "rlhar2",       0x21CC,
+        "rlm",          0x200F,
         "rpar",         0x0029,
         "rpargt",       0xE291,
+        "rsaquo",       0x203A,
         "rsh",          0x21B1,
         "rsqb",         0x005D,
         "rsquo",        0x2019,
@@ -804,6 +827,7 @@
         "Sacute",       0x015A,
         "sacute",       0x015B,
         "samalg",       0x2210,
+        "sbquo",        0x201A,
         "sbsol",        0xFE68,
         "sc",           0x227B,
         "scap",         0x227F,
@@ -839,6 +863,7 @@
         "shy",          0x00AD,
         "Sigma",        0x03A3,
         "sigma",        0x03C3,
+        "sigmaf",       0x03C2,
         "sigmav",       0x03C2,
         "sim",          0x223C,
         "sime",         0x2243,
@@ -886,6 +911,7 @@

 static NameId namesT[]={
         "target",       0x2316,
+        "Tau",          0x03A4,
         "tau",          0x03C4,
         "Tcaron",       0x0164,
         "tcaron",       0x0165,
@@ -899,7 +925,9 @@
         "tgr",          0x03C4,
         "there4",       0x2234,
         "Theta",        0x0398,
+        "theta",        0x03B8,
         "thetas",       0x03B8,
+        "thetasym",     0x03D1,
         "thetav",       0x03D1,
         "THgr",         0x0398,
         "thgr",         0x03B8,
@@ -961,8 +989,11 @@
         "Uogon",        0x0172,
         "uogon",        0x0173,
         "uplus",        0x228E,
+        "Upsi",         0x03A5,
         "upsi",         0x03C5,
-        "Upsi",         0x03D2,
+        "upsih",        0x03D2,
+        "Upsilon",      0x03A5,
+        "upsilon",      0x03C5,
         "urcorn",       0x231D,
         "urcrop",       0x230E,
         "Uring",        0x016E,
@@ -1052,11 +1083,14 @@
         "zcy",          0x0437,
         "Zdot",         0x017B,
         "zdot",         0x017C,
+        "Zeta",         0x0396,
         "zeta",         0x03B6,
         "Zgr",          0x0396,
         "zgr",          0x03B6,
         "ZHcy",         0x0416,
         "zhcy",         0x0436,
+        "zwj",          0x200D,
+        "zwnj",         0x200C,
         NULL, 0
 };

@@ -1070,297 +1104,17 @@
 int HtmlNamedEntity(unsigned char *p, int length)
 {
     int tableIndex = tolower(*p) - 'a';
-    if (tableIndex >= 0 && tableIndex < 26) {
+    if (tableIndex >= 0 && tableIndex < 26)
+    {
         NameId* names = namesTable[tableIndex];
         int i;

-        for (i = 0; names[i].name; i++){
-                if (strncmp(names[i].name, (char *)p, length) == 0){
-                        return names[i].value;
-                }
+        for (i = 0; names[i].name; i++)
+        {
+            if (strncmp(names[i].name, (char *)p, length) == 0)
+                return names[i].value;
         }
     }
-    error("unrecognized character entity \"%.*s\"", length, p);
-    return -1;
-}
-
-#else //TODO: Merge Walter's list with Thomas'
-
-static NameId names[] =
-{
-    // Entities
-    "quot",     34,
-    "amp",      38,
-    "lt",       60,
-    "gt",       62,
-
-    "OElig",    338,
-    "oelig",    339,
-    "Scaron",   352,
-    "scaron",   353,
-    "Yuml",     376,
-    "circ",     710,
-    "tilde",    732,
-    "ensp",     8194,
-    "emsp",     8195,
-    "thinsp",   8201,
-    "zwnj",     8204,
-    "zwj",      8205,
-    "lrm",      8206,
-    "rlm",      8207,
-    "ndash",    8211,
-    "mdash",    8212,
-    "lsquo",    8216,
-    "rsquo",    8217,
-    "sbquo",    8218,
-    "ldquo",    8220,
-    "rdquo",    8221,
-    "bdquo",    8222,
-    "dagger",   8224,
-    "Dagger",   8225,
-    "permil",   8240,
-    "lsaquo",   8249,
-    "rsaquo",   8250,
-    "euro",     8364,
-
-    // Latin-1 (ISO-8859-1) Entities
-    "nbsp",     160,
-    "iexcl",    161,
-    "cent",     162,
-    "pound",    163,
-    "curren",   164,
-    "yen",      165,
-    "brvbar",   166,
-    "sect",     167,
-    "uml",      168,
-    "copy",     169,
-    "ordf",     170,
-    "laquo",    171,
-    "not",      172,
-    "shy",      173,
-    "reg",      174,
-    "macr",     175,
-    "deg",      176,
-    "plusmn",   177,
-    "sup2",     178,
-    "sup3",     179,
-    "acute",    180,
-    "micro",    181,
-    "para",     182,
-    "middot",   183,
-    "cedil",    184,
-    "sup1",     185,
-    "ordm",     186,
-    "raquo",    187,
-    "frac14",   188,
-    "frac12",   189,
-    "frac34",   190,
-    "iquest",   191,
-    "Agrave",   192,
-    "Aacute",   193,
-    "Acirc",    194,
-    "Atilde",   195,
-    "Auml",     196,
-    "Aring",    197,
-    "AElig",    198,
-    "Ccedil",   199,
-    "Egrave",   200,
-    "Eacute",   201,
-    "Ecirc",    202,
-    "Euml",     203,
-    "Igrave",   204,
-    "Iacute",   205,
-    "Icirc",    206,
-    "Iuml",     207,
-    "ETH",      208,
-    "Ntilde",   209,
-    "Ograve",   210,
-    "Oacute",   211,
-    "Ocirc",    212,
-    "Otilde",   213,
-    "Ouml",     214,
-    "times",    215,
-    "Oslash",   216,
-    "Ugrave",   217,
-    "Uacute",   218,
-    "Ucirc",    219,
-    "Uuml",     220,
-    "Yacute",   221,
-    "THORN",    222,
-    "szlig",    223,
-    "agrave",   224,
-    "aacute",   225,
-    "acirc",    226,
-    "atilde",   227,
-    "auml",     228,
-    "aring",    229,
-    "aelig",    230,
-    "ccedil",   231,
-    "egrave",   232,
-    "eacute",   233,
-    "ecirc",    234,
-    "euml",     235,
-    "igrave",   236,
-    "iacute",   237,
-    "icirc",    238,
-    "iuml",     239,
-    "eth",      240,
-    "ntilde",   241,
-    "ograve",   242,
-    "oacute",   243,
-    "ocirc",    244,
-    "otilde",   245,
-    "ouml",     246,
-    "divide",   247,
-    "oslash",   248,
-    "ugrave",   249,
-    "uacute",   250,
-    "ucirc",    251,
-    "uuml",     252,
-    "yacute",   253,
-    "thorn",    254,
-    "yuml",     255,
-
-        // Symbols and Greek letter entities
-    "fnof",     402,
-    "Alpha",    913,
-    "Beta",     914,
-    "Gamma",    915,
-    "Delta",    916,
-    "Epsilon",  917,
-    "Zeta",     918,
-    "Eta",      919,
-    "Theta",    920,
-    "Iota",     921,
-    "Kappa",    922,
-    "Lambda",   923,
-    "Mu",       924,
-    "Nu",       925,
-    "Xi",       926,
-    "Omicron",  927,
-    "Pi",       928,
-    "Rho",      929,
-    "Sigma",    931,
-    "Tau",      932,
-    "Upsilon",  933,
-    "Phi",      934,
-    "Chi",      935,
-    "Psi",      936,
-    "Omega",    937,
-    "alpha",    945,
-    "beta",     946,
-    "gamma",    947,
-    "delta",    948,
-    "epsilon",  949,
-    "zeta",     950,
-    "eta",      951,
-    "theta",    952,
-    "iota",     953,
-    "kappa",    954,
-    "lambda",   955,
-    "mu",       956,
-    "nu",       957,
-    "xi",       958,
-    "omicron",  959,
-    "pi",       960,
-    "rho",      961,
-    "sigmaf",   962,
-    "sigma",    963,
-    "tau",      964,
-    "upsilon",  965,
-    "phi",      966,
-    "chi",      967,
-    "psi",      968,
-    "omega",    969,
-    "thetasym", 977,
-    "upsih",    978,
-    "piv",      982,
-    "bull",     8226,
-    "hellip",   8230,
-    "prime",    8242,
-    "Prime",    8243,
-    "oline",    8254,
-    "frasl",    8260,
-    "weierp",   8472,
-    "image",    8465,
-    "real",     8476,
-    "trade",    8482,
-    "alefsym",  8501,
-    "larr",     8592,
-    "uarr",     8593,
-    "rarr",     8594,
-    "darr",     8595,
-    "harr",     8596,
-    "crarr",    8629,
-    "lArr",     8656,
-    "uArr",     8657,
-    "rArr",     8658,
-    "dArr",     8659,
-    "hArr",     8660,
-    "forall",   8704,
-    "part",     8706,
-    "exist",    8707,
-    "empty",    8709,
-    "nabla",    8711,
-    "isin",     8712,
-    "notin",    8713,
-    "ni",       8715,
-    "prod",     8719,
-    "sum",      8721,
-    "minus",    8722,
-    "lowast",   8727,
-    "radic",    8730,
-    "prop",     8733,
-    "infin",    8734,
-    "ang",      8736,
-    "and",      8743,
-    "or",       8744,
-    "cap",      8745,
-    "cup",      8746,
-    "int",      8747,
-    "there4",   8756,
-    "sim",      8764,
-    "cong",     8773,
-    "asymp",    8776,
-    "ne",       8800,
-    "equiv",    8801,
-    "le",       8804,
-    "ge",       8805,
-    "sub",      8834,
-    "sup",      8835,
-    "nsub",     8836,
-    "sube",     8838,
-    "supe",     8839,
-    "oplus",    8853,
-    "otimes",   8855,
-    "perp",     8869,
-    "sdot",     8901,
-    "lceil",    8968,
-    "rceil",    8969,
-    "lfloor",   8970,
-    "rfloor",   8971,
-    "lang",     9001,
-    "rang",     9002,
-    "loz",      9674,
-    "spades",   9824,
-    "clubs",    9827,
-    "hearts",   9829,
-    "diams",    9830,
-};
-
-int HtmlNamedEntity(unsigned char *p, int length)
-{
-    int i;
-
-    // BUG: this is a dumb, slow linear search
-    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
-    {
-        // Entries are case sensitive
-        if (memcmp(names[i].name, (char *)p, length) == 0 &&
-            !names[i].name[length])
-            return names[i].value;
-    }
     return -1;
 }

-#endif

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
November 26, 2010
http://d.puremagic.com/issues/show_bug.cgi?id=5221



--- Comment #3 from Iain Buclaw <ibuclaw@ubuntu.com> 2010-11-26 11:42:28 PST ---
Created an attachment (id=834)
Updated merge.

Yikes! I didn't know my last update was going to do *that*.

Sorry for any noise. Here's an updated patch against svn; it adds some bits and corrects some mistakes in Thomas' list.

Checked and tested against the testsuite. =)

Regards

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
January 28, 2011
http://d.puremagic.com/issues/show_bug.cgi?id=5221


Aziz Köksal <aziz.koeksal@gmail.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |aziz.koeksal@gmail.com


--- Comment #4 from Aziz Köksal <aziz.koeksal@gmail.com> 2011-01-28 14:25:56 PST ---
I researched this issue with named HTML entities and found several different lists out there.

I think the following list is the most complete and most accurate one:

http://www.w3.org/2003/entities/2007/w3centities-f.ent

Please consider mentioning this in the language specification.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
January 28, 2011
http://d.puremagic.com/issues/show_bug.cgi?id=5221


Don <clugdbug@yahoo.com.au> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |clugdbug@yahoo.com.au


--- Comment #5 from Don <clugdbug@yahoo.com.au> 2011-01-28 14:36:53 PST ---
(In reply to comment #4)
> I researched this issue with named HTML entities and found several, different lists out there.
> 
> I think the following list is the most complete and most accurate one:
> 
> http://www.w3.org/2003/entities/2007/w3centities-f.ent
> 
> Please consider mentioning this in the language specification.

A few hours ago I merged this patch into my fork of dmd. Complete source is here:

https://github.com/donc/dmd/blob/master/src/entity.c

Would be great if you or someone else could compare that list to the one you've just posted.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
January 29, 2011
http://d.puremagic.com/issues/show_bug.cgi?id=5221



--- Comment #6 from Iain Buclaw <ibuclaw@ubuntu.com> 2011-01-28 16:20:41 PST ---
(In reply to comment #5)
> (In reply to comment #4)
> > I researched this issue with named HTML entities and found several, different lists out there.
> > 
> > I think the following list is the most complete and most accurate one:
> > 
> > http://www.w3.org/2003/entities/2007/w3centities-f.ent
> > 
> > Please consider mentioning this in the language specification.
> 
> A few hours ago I merged this patch into my fork of dmd. Complete source is here:
> 
> https://github.com/donc/dmd/blob/master/src/entity.c
> 
> Would be great if you or someone else could compare that list, to the one you've just posted.

There are quite a lot of additions, and the odd difference in between. I can do an update, though I guess it depends on how much you want to put in.

There are entities whose value is larger than an unsigned short, e.g.:

"b.nu",             0x1D6CE,
"b.Omega",          0x1D6C0,
"b.omega",          0x1D6DA,
"Bopf",             0x1D539,
"bopf",             0x1D553,

Which then leads to question #2, does the parser allow '\&b.nu;' ?

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
January 29, 2011
http://d.puremagic.com/issues/show_bug.cgi?id=5221



--- Comment #7 from Iain Buclaw <ibuclaw@ubuntu.com> 2011-01-28 16:21:55 PST ---
Answered that myself, no it does not. :)

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
January 29, 2011
http://d.puremagic.com/issues/show_bug.cgi?id=5221



--- Comment #8 from Iain Buclaw <ibuclaw@ubuntu.com> 2011-01-29 02:39:02 PST ---
Created an attachment (id=887)
new entity.c source based off new link

Attaching new source file based off the link above (made rather swiftly using vi
macros).

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
January 29, 2011
http://d.puremagic.com/issues/show_bug.cgi?id=5221



--- Comment #9 from Iain Buclaw <ibuclaw@ubuntu.com> 2011-01-29 02:40:57 PST ---
Created an attachment (id=888)
diff between file and donc/dmd/src/entity.c

Attaching diff between new file and https://github.com/donc/dmd/blob/master/src/entity.c

Used 'diff -wur' to ignore whitespace differences.

Regards

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
« First   ‹ Prev
1 2