Eigenstate: myrddin-dev mailing list

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 2/2] Implement bygrapheme()


---
 lib/std/test/utf.myr | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/std/utf.myr      | 25 ++++++++++++++++++
 2 files changed, 98 insertions(+)

diff --git a/lib/std/test/utf.myr b/lib/std/test/utf.myr
index 9f73f5ae..712081da 100644
--- a/lib/std/test/utf.myr
+++ b/lib/std/test/utf.myr
@@ -28,4 +28,77 @@ const main = {
 		"wrong width of runes")
 	std.assert(std.strcellwidth("𒀸 𒌋𒅗 𒆷 𒂅𒌒 𒍜 𒀭𒉌𒄿 𒈗 𒁁𒉌 𒋬") == 22, \
 		"wrong width of Cuneiform")
+
+	/* bygrapheme() */
+	var s = "a史cЯx̀̀̀̀̀yz̉"
+	var sub, rest
+
+	(sub, rest) = std.bygrapheme(s)
+	std.assert(std.streq(sub, "a"), "didn't get \"a\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "史"), "didn't get \"史\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "c"), "didn't get \"c\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "Я"), "didn't get \"Я\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "x̀̀̀̀̀"), "didn't get \"x̀̀̀̀̀\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "y"), "didn't get \"y\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "z̉"), "didn't get \"z̉\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(sub.len == 0, "didn't get \"\" as last grapheme")
+
+
+	/* with excessive combiners */
+	s = "c̸̶̡̡̗̣͕̪͖ͯ͑̈̄̿͊ͣ̈́͝ḧ̵̸̛̥͚̭̣͈͖̼͈͓͓̫͍́̓ͪͫ̋͘͡a̢̩̱̠̘̹̤̯͚̦̰̼̯̲̞͆͂̿ͬ̂͋͒̈ͅͅo̷̷̶̥͖̼̮̳̗͚ͦ̉̆̅̃̍ͤ̆͑ͣ̽́̚s̓̍̍̄͏̖̞̟̱́͡͡͝"
+
+	(sub, rest) = std.bygrapheme(s)
+	std.assert(std.streq(sub, "c̸̶̡̡̗̣͕̪͖ͯ͑̈̄̿͊ͣ̈́͝"), "didn't get \"c̸̶̡̡̗̣͕̪͖ͯ͑̈̄̿͊ͣ̈́͝\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "ḧ̵̸̛̥͚̭̣͈͖̼͈͓͓̫͍́̓ͪͫ̋͘͡"), "didn't get \"ḧ̵̸̛̥͚̭̣͈͖̼͈͓͓̫͍́̓ͪͫ̋͘͡\" as next grapheme, it was {}", rest)
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "a̢̩̱̠̘̹̤̯͚̦̰̼̯̲̞͆͂̿ͬ̂͋͒̈ͅͅ"), "didn't get \"a̢̩̱̠̘̹̤̯͚̦̰̼̯̲̞͆͂̿ͬ̂͋͒̈ͅͅ\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "o̷̷̶̥͖̼̮̳̗͚ͦ̉̆̅̃̍ͤ̆͑ͣ̽́̚"), "didn't get \"o̷̷̶̥͖̼̮̳̗͚ͦ̉̆̅̃̍ͤ̆͑ͣ̽́̚\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "s̓̍̍̄͏̖̞̟̱́͡͡͝"), "didn't get \"s̓̍̍̄͏̖̞̟̱́͡͡͝\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(sub.len == 0, "didn't get \"\" as last grapheme")
+
+	/* invalid UTF-8: 0xFE and 0xFF never occur in UTF-8, 0x92 is a stray continuation byte */
+	s = [ ('A' : byte), ('b' : byte), (0xFE : byte),
+	      (0xFF : byte), (0x92 : byte), ('c' : byte) ][:]
+
+	(sub, rest) = std.bygrapheme(s)
+	std.assert(std.streq(sub, "A"), "didn't get \"A\" as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "b"), "didn't get \"b\" as next grapheme")
+
+	/* each undecodable byte is expected back as its own one-byte grapheme */
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, [ (0xFE : byte) ][:]), "didn't get 0xFE, len={} as next grapheme", sub.len)
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, [ (0xFF : byte) ][:]), "didn't get 0xFF as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, [ (0x92 : byte) ][:]), "didn't get 0x92 as next grapheme")
+
+	(sub, rest) = std.bygrapheme(rest)
+	std.assert(std.streq(sub, "c"), "didn't get \"c\" as next grapheme")
 }
diff --git a/lib/std/utf.myr b/lib/std/utf.myr
index 439254ca..fcf922a3 100644
--- a/lib/std/utf.myr
+++ b/lib/std/utf.myr
@@ -13,6 +13,7 @@ pkg std =
 	const encode	: (buf : byte[:], chr : char -> size)
 	const decode	: (buf : byte[:] -> char)
 	const strstep	: (str : byte[:] -> (char, byte[:]))
+	const bygrapheme : (str : byte[:] -> (byte[:], byte[:]))
 
 	const strcellwidth : (str : byte[:] -> size)
 ;;
@@ -63,6 +64,30 @@ const decode = {buf
 	-> c
 }
 
+/* Splits str into (first grapheme, remainder): runes are gathered until a rune of
+   positive cellwidth() follows one already collected, or a bad byte is decoded. */
+const bygrapheme = {str
+	var len = 0	/* bytes of str gathered into the current cluster */
+	var rest = str
+	var c, cn
+	var width = 0	/* summed cellwidth() of the cluster; must start at 0, it is tested before any add */
+
+	while rest.len > 0
+		(c, rest) = strstep(rest)
+		cn = cellwidth(c)
+		if (cn > 0 && width > 0) || (c == Badchar && len > 0)
+			-> (str[:len], str[len:])	/* c starts the next cluster; never cut gathered runes short */
+		elif c == Badchar
+			-> (str[:1], str[1:])	/* nothing gathered yet: hand back the offending first byte alone */
+		else
+			len += charlen(c)
+			width += cn
+		;;
+	;;
+
+	-> (str[:len], str[len:])	/* input exhausted: whatever was gathered is the last cluster */
+}
+
 const strstep = {str
 	var len
 	var mask
-- 
2.15.0


Follow-Ups:
Re: [PATCH 2/2] Implement bygrapheme() — Ori Bernstein <ori@xxxxxxxxxxxxxx>
References:
[PATCH 0/2] Implement bygrapheme() — "S. Gilles" <sgilles@xxxxxxxxxxxx>