From c5cda03125a6d34c179d968011083bceb87976bd Mon Sep 17 00:00:00 2001
From: Michele Calgaro <michele.calgaro@yahoo.it>
Date: Wed, 29 Jan 2025 18:05:37 +0900
Subject: Add support for surrogate pairs to TQChar API.

This relates to issue #162.
The new code is partially taken from Qt4 but with some local rework.

Signed-off-by: Michele Calgaro <michele.calgaro@yahoo.it>
---
 src/3rdparty/libpng/CHANGES     |  2 +-
 src/codecs/tqgb18030codec.cpp   | 30 ++++++++++++----------------
 src/codecs/tqutfcodec.cpp       | 23 +++++++++-------------
 src/kernel/tqfontengine_x11.cpp | 17 ++++++++--------
 src/kernel/tqtextengine.cpp     |  3 +--
 src/tools/tqstring.cpp          | 22 ++++++++-------------
 src/tools/tqstring.h            | 43 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 82 insertions(+), 58 deletions(-)

diff --git a/src/3rdparty/libpng/CHANGES b/src/3rdparty/libpng/CHANGES
index d151a41c7..eea6916ee 100644
--- a/src/3rdparty/libpng/CHANGES
+++ b/src/3rdparty/libpng/CHANGES
@@ -828,7 +828,7 @@ version 1.0.8 [July 24, 2000]
 version 1.0.9beta1 [November 10, 2000]
   Fixed typo in scripts/makefile.hpux
   Updated makevms.com in scripts and contrib/* and contrib/* (Martin Zinser)
-  Fixed seqence-point bug in contrib/pngminus/png2pnm (Martin Zinser)
+  Fixed sequence-point bug in contrib/pngminus/png2pnm (Martin Zinser)
   Changed "cdrom.com" in documentation to "libpng.org"
   Revised pnggccrd.c to get it all working, and updated makefile.gcmmx (Greg).
   Changed type of "params" from voidp to png_voidp in png_read|write_png().
diff --git a/src/codecs/tqgb18030codec.cpp b/src/codecs/tqgb18030codec.cpp
index 0ae2fb4ff..d2578dc8e 100644
--- a/src/codecs/tqgb18030codec.cpp
+++ b/src/codecs/tqgb18030codec.cpp
@@ -184,18 +184,16 @@ TQCString TQGb18030Codec::fromUnicode(const TQString& uc, int& lenInOut) const
 	if ( ch.row() == 0x00 && ch.cell() < 0x80 ) {
 	    // ASCII
 	    *cursor++ = ch.cell();
-	} else if ((ch.unicode() & 0xf800) == 0xd800) {
-	    unsigned short high = ch.unicode();
+	} else if (ch.isHighSurrogate()) {
 	    // surrogates area. check for correct encoding
 	    // we need at least one more character, first the high surrogate, then the low one
-	    if (i == l-1 || high >= 0xdc00)
+	    if (i == l-1)
 		*cursor++ = '?';
 	    else {
-		unsigned short low = uc[i+1].unicode();
-		if (low >= 0xdc00 && low <= 0xdfff) {
+		if (uc[i+1].isLowSurrogate()) {
 		    // valid surrogate pair
+		    uint u = TQChar::surrogateToUcs4(uc[i], uc[i + 1]);
 		    ++i;
-		    uint u = (high-0xd800)*0x400+(low-0xdc00)+0x10000;
 		    len = qt_UnicodeToGb18030(u, buf);
 		    if (len >= 2) {
 			for (int j=0; j<len; j++)
@@ -241,15 +239,13 @@ TQString TQGb18030Codec::toUnicode(const char* chars, int len) const
 	    uint u = qt_Gb18030ToUnicode( (const uchar*)(chars + i), clen );
 
 	    if (clen == 2 || clen == 4) {
-		if (u < 0x10000)
+		if (!TQChar::requiresSurrogates(u)) {
 		    result += TQValidChar(u);
+		}
 		else {
 		    // encode into surrogate pair
-		    u -= 0x10000;
-		    unsigned short high = u/0x400 + 0xd800;
-		    unsigned short low = u%0x400 + 0xdc00;
-		    result += TQChar(high);
-		    result += TQChar(low);
+		    result += TQChar(TQChar::highSurrogate(u));
+		    result += TQChar(TQChar::lowSurrogate(u));
 		}
 		i += clen;
 	    } else if (i < len) {
@@ -402,15 +398,13 @@ public:
 		    int clen = 4;
 		    uint u = qt_Gb18030ToUnicode(buf, clen);
 		    if (clen == 4) {
-			if (u < 0x10000)
+			if (!TQChar::requiresSurrogates(u)) {
 			    result += TQValidChar(u);
+			}
 			else {
 			    // encode into surrogate pair
-			    u -= 0x10000;
-			    unsigned short high = u/0x400 + 0xd800;
-			    unsigned short low = u%0x400 + 0xdc00;
-			    result += TQChar(high);
-			    result += TQChar(low);
+			    result += TQChar(TQChar::highSurrogate(u));
+			    result += TQChar(TQChar::lowSurrogate(u));
 			}
 		    } else {
 			result += TQChar::replacement;
diff --git a/src/codecs/tqutfcodec.cpp b/src/codecs/tqutfcodec.cpp
index 1125aa9f3..eba25e505 100644
--- a/src/codecs/tqutfcodec.cpp
+++ b/src/codecs/tqutfcodec.cpp
@@ -64,13 +64,10 @@ TQCString TQUtf8Codec::fromUnicode(const TQString& uc, int& lenInOut) const
  	    if ( u < 0x0800 ) {
 		*cursor++ = 0xc0 | ((uchar) (u >> 6));
  	    } else {
-		if (u >= 0xd800 && u < 0xdc00 && i < l-1) {
-		    unsigned short low = ch[1].unicode();
-		    if (low >= 0xdc00 && low < 0xe000) {
-			++ch;
-			++i;
-			u = (u - 0xd800)*0x400 + (low - 0xdc00) + 0x10000;
-		    }
+		if (ch[0].isHighSurrogate() && i < (l - 1) && ch[1].isLowSurrogate()) {
+		    u = TQChar::surrogateToUcs4(ch[0], ch[1]);
+		    ++ch;
+		    ++i;
 		}
 		if (u > 0xffff) {
 		    // see TQString::fromUtf8() and TQString::utf8() for explanations
@@ -179,16 +176,14 @@ public:
 		    uc = (uc << 6) | (ch & 0x3f);
 		    need--;
 		    if ( !need ) {
-			if (uc > 0xffff) {
+			if (TQChar::requiresSurrogates(uc)) {
 			    // surrogate pair
-			    uc -= 0x10000;
-			    unsigned short high = uc/0x400 + 0xd800;
-			    unsigned short low = uc%0x400 + 0xdc00;
-			    *qch++ = TQChar(high);
-			    *qch++ = TQChar(low);
+			    *qch++ = TQChar(TQChar::highSurrogate(uc));
+			    *qch++ = TQChar(TQChar::lowSurrogate(uc));
 			    headerDone = TRUE;
 			} else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || (uc >= 0xfffe)) {
-                            *qch++ = TQChar::replacement;
+			    // overlong sequence, UTF16 surrogate or BOM
+			    *qch++ = TQChar::replacement;
 			} else {
 			    if (headerDone || TQChar(uc) != TQChar::byteOrderMark)
 				*qch++ = uc;
diff --git a/src/kernel/tqfontengine_x11.cpp b/src/kernel/tqfontengine_x11.cpp
index b3461a6ff..47078dea9 100644
--- a/src/kernel/tqfontengine_x11.cpp
+++ b/src/kernel/tqfontengine_x11.cpp
@@ -1531,16 +1531,15 @@ static glyph_t getAdobeCharIndex(XftFont *font, int cmap, uint ucs4)
     return g;
 }
 
-static uint getChar(const TQChar *str, int &i, const int len)
+static uint getUnicode(const TQChar *str, int &i, const int len)
 {
-    uint uc = str[i].unicode();
-    if (uc >= 0xd800 && uc < 0xdc00 && i < len-1) {
-	uint low = str[++i].unicode();
-	if (low >= 0xdc00 && low < 0xe000) {
-	    uc = (uc - 0xd800)*0x400 + (low - 0xdc00) + 0x10000;
-	}
+    if (str[i].isHighSurrogate() && i < (len - 1) && str[i + 1].isLowSurrogate())
+    {
+	++i; // Consume the low surrogate as well: 'i' is the caller's loop
+	     // index (passed by reference), so the pair counts as one step
+	return TQChar::surrogateToUcs4(str[i - 1], str[i]);
     }
-    return uc;
+    return str[i].unicode();
 }
 
 TQFontEngine::Error TQFontEngineXft::stringToCMap( const TQChar *str, int len, glyph_t *glyphs, advance_t *advances, int *nglyphs, bool mirrored ) const
@@ -1552,7 +1551,7 @@ TQFontEngine::Error TQFontEngineXft::stringToCMap( const TQChar *str, int len, g
 
     int glyph_pos = 0;
     for ( int i = 0; i < len; ++i ) {
-	uint uc = getChar(str, i, len);
+	uint uc = getUnicode(str, i, len);
 	if ( uc == 0xa0 )
 	    uc = 0x20;
 	if ( mirrored )
diff --git a/src/kernel/tqtextengine.cpp b/src/kernel/tqtextengine.cpp
index f50d849cc..05cdbcc13 100644
--- a/src/kernel/tqtextengine.cpp
+++ b/src/kernel/tqtextengine.cpp
@@ -819,8 +819,7 @@ static void calcLineBreaks(const TQString &str, TQCharAttributes *charAttributes
         
         if (category == TQChar::Other_Surrogate) {
             // char stop only on first pair
-            if (uc[i].unicode() >= 0xd800 && uc[i].unicode() < 0xdc00 && i < len-1
-                && uc[i+1].unicode() >= 0xdc00 && uc[i+1].unicode() < 0xe000)
+            if (uc[i].isHighSurrogate() && i < (len - 1) && uc[i + 1].isLowSurrogate())
                 goto nsm;
             // ### correctly handle second surrogate
         }
diff --git a/src/tools/tqstring.cpp b/src/tools/tqstring.cpp
index 318f1aa77..8db00f1cc 100644
--- a/src/tools/tqstring.cpp
+++ b/src/tools/tqstring.cpp
@@ -6016,13 +6016,10 @@ TQCString TQString::utf8() const
  	    if ( u < 0x0800 ) {
 		*cursor++ = 0xc0 | ((uchar) (u >> 6));
  	    } else {
-		if (u >= 0xd800 && u < 0xdc00 && i < l-1) {
-		    unsigned short low = ch[1].unicode();
-		    if (low >= 0xdc00 && low < 0xe000) {
-			++ch;
-			++i;
-			u = (u - 0xd800)*0x400 + (low - 0xdc00) + 0x10000;
-		    }
+		if (ch[0].isHighSurrogate() && i < (l - 1) && ch[1].isLowSurrogate()) {
+		    u = TQChar::surrogateToUcs4(ch[0], ch[1]);
+		    ++ch;
+		    ++i;
 		}
 		if (u > 0xffff) {
 		    // if people are working in utf8, but strings are encoded in eg. latin1, the resulting
@@ -6101,15 +6098,12 @@ TQString TQString::fromUtf8( const char* utf8, int len )
 		uc = (uc << 6) | (ch & 0x3f);
 		need--;
 		if ( !need ) {
-		    if (uc > 0xffff) {
+		    if (TQChar::requiresSurrogates(uc)) {
 			// surrogate pair
-			uc -= 0x10000;
-			unsigned short high = uc/0x400 + 0xd800;
-			unsigned short low = uc%0x400 + 0xdc00;
-			*qch++ = TQChar(high);
-			*qch++ = TQChar(low);
+			*qch++ = TQChar(TQChar::highSurrogate(uc));
+			*qch++ = TQChar(TQChar::lowSurrogate(uc));
 		    } else if (uc < min_uc || (uc >= 0xd800 && uc <= 0xdfff) || (uc >= 0xfffe)) {
-			// overlong seqence, UTF16 surrogate or BOM
+			// overlong sequence, UTF16 surrogate or BOM
                         i = error;
                         qch = addOne(qch, result);
                         *qch++ = TQChar(0xdbff);
diff --git a/src/tools/tqstring.h b/src/tools/tqstring.h
index 03fcf9459..c29a9c392 100644
--- a/src/tools/tqstring.h
+++ b/src/tools/tqstring.h
@@ -222,6 +222,14 @@ public:
     bool isDigit() const;
     bool isSymbol() const;
 
+    // Surrogate pairs support
+    bool isHighSurrogate() const;
+    bool isLowSurrogate() const;
+    static bool requiresSurrogates(uint ucs4);
+    static ushort highSurrogate(uint ucs4);
+    static ushort lowSurrogate(uint ucs4);
+    static uint surrogateToUcs4(const TQChar &high, const TQChar &low);
+
     uchar cell() const { return ((uchar) ucs & 0xff); }
     uchar row() const { return ((uchar) (ucs>>8)&0xff); }
     void setCell( uchar cell ) { ucs = (ucs & 0xff00) + cell; }
@@ -313,6 +321,36 @@ inline TQChar::TQChar( int rc ) : ucs( (ushort) (rc & 0xffff) )
 {
 }
 
+inline bool TQChar::isHighSurrogate() const
+{
+    return ucs >= 0xd800 && ucs <= 0xdbff; // first (lead) unit of a UTF-16 surrogate pair
+}
+
+inline bool TQChar::isLowSurrogate() const
+{
+    return ucs >= 0xdc00 && ucs <= 0xdfff; // second (trail) unit of a UTF-16 surrogate pair
+}
+
+inline bool TQChar::requiresSurrogates(uint ucs4)
+{
+    return ucs4 > 0xffff; // does not fit in a single 16-bit code unit
+}
+
+inline ushort TQChar::highSurrogate(uint ucs4)
+{
+    return ushort(0xd800 | ((ucs4 - 0x10000) >> 10)); // 0xd800 marker + bits 10 and up of (ucs4 - 0x10000)
+}
+
+inline ushort TQChar::lowSurrogate(uint ucs4)
+{
+    return ushort(0xdc00 + (ucs4 % 0x400)); // 0xdc00 marker + low 10 bits of ucs4
+}
+
+inline uint TQChar::surrogateToUcs4(const TQChar &high, const TQChar &low)
+{   // Join the two 10-bit payloads, then ADD 0x10000 (OR loses it when bit 16 is already set, e.g. D840 DC00 = U+20000)
+    return (((uint(high.ucs) & 0x03FF) << 10) | (uint(low.ucs) & 0x03FF)) + 0x10000;
+}
+
 inline bool operator==( char ch, TQChar c )
 {
     return ((uchar) ch) == c.ucs;
@@ -806,6 +844,11 @@ public:
     bool isNumber() const { return s.constref(p).isNumber(); }
     bool isLetterOrNumber() { return s.constref(p).isLetterOrNumber(); }
     bool isDigit() const { return s.constref(p).isDigit(); }
+    bool isSymbol() const { return s.constref(p).isSymbol(); }
+
+    // Surrogate pairs support
+    bool isHighSurrogate() const { return s.constref(p).isHighSurrogate(); }
+    bool isLowSurrogate() const { return s.constref(p).isLowSurrogate(); }
 
     int digitValue() const { return s.constref(p).digitValue(); }
     TQChar lower() const { return s.constref(p).lower(); }
-- 
cgit v1.2.3

