[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 2/2 v2] Implement graphemestep

Subject: [PATCH 2/2 v2] Implement graphemestep
From: "S. Gilles" <sgilles@xxxxxxxxxxxx>
Reply-to: myrddin-dev@xxxxxxxxxxxxxx
Date: Sun, 5 Nov 2017 04:11:27 -0500
To: "myrddin-dev" <myrddin-dev@xxxxxxxxxxxxxx>
Cc: "S. Gilles" <sgilles@xxxxxxxxxxxx>
And change 'strstep' to 'charstep' for consistency, now that it has
a sibling function.
---

Agreed. After discussion in irc, I think the implementation is as
I want it, but the name was misleading. Since v1:

 - Rename to graphemestep, since it behaves similarly to strstep.

 - While we're at it, rename strstep to charstep.

I'm completely open to bikeshedding about the naming convention.
In particular, I think it's a little confusing that 'str' isn't
anywhere in the scheme, but names like strstepchar and strstepgrapheme
seem a trifle too long.

---
 lib/date/fmt.myr      |  4 +--
 lib/date/parse.myr    | 10 +++----
 lib/http/parse.myr    |  2 +-
 lib/http/url.myr      |  2 +-
 lib/std/cmp.myr       |  4 +--
 lib/std/fmt.myr       |  4 +--
 lib/std/hashfuncs.myr |  6 ++---
 lib/std/optparse.myr  |  2 +-
 lib/std/striter.myr   |  2 +-
 lib/std/test/utf.myr  | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/std/utf.myr       | 33 ++++++++++++++++++++---
 mbld/parse.myr        |  6 ++---
 12 files changed, 123 insertions(+), 25 deletions(-)

diff --git a/lib/date/fmt.myr b/lib/date/fmt.myr
index 82f865b0..39df830b 100644
--- a/lib/date/fmt.myr
+++ b/lib/date/fmt.myr
@@ -40,9 +40,9 @@ const sbfmt = {sb, ap, opts
 const datefmt = {sb, fmt, d
 	var c
 	while fmt.len != 0
-		(c, fmt) = std.strstep(fmt)
+		(c, fmt) = std.charstep(fmt)
 		if c == '%'
-			(c, fmt) = std.strstep(fmt)
+			(c, fmt) = std.charstep(fmt)
 			match c
 			| 'a':	std.sbfmt(sb, "{}", _names.abbrevday[d.wday])
 			| 'A':	std.sbfmt(sb, "{}", _names.fullday[d.wday])
diff --git a/lib/date/parse.myr b/lib/date/parse.myr
index 19798ddf..7f2eafff 100644
--- a/lib/date/parse.myr
+++ b/lib/date/parse.myr
@@ -70,9 +70,9 @@ const filldate = {d, f, s, seen, err
 	z = ""
 	am = `std.None
 	while f.len != 0
-		(fc, f) = std.strstep(f)
+		(fc, f) = std.charstep(f)
 		if fc == '%'
-			(fc, f) = std.strstep(f)
+			(fc, f) = std.charstep(f)
 			if std.bshas(seen, fc)
 				err# = `std.Some `Doublefmt fc
 				-> s
@@ -120,7 +120,7 @@ const filldate = {d, f, s, seen, err
 			| _:	std.fatal("unknown format character %c\n", fc)
 			;;
 		else
-			(sc, s) = std.strstep(s)
+			(sc, s) = std.charstep(s)
 			if std.isspace(fc) && std.isspace(sc)
 				s = eatspace(s)
 			elif sc != fc
@@ -157,7 +157,7 @@ const eatspace = {s
 	var c
 
 	while std.isspace(std.decode(s))
-		(c, s) = std.strstep(s)
+		(c, s) = std.charstep(s)
 	;;
 	-> s
 }
@@ -246,7 +246,7 @@ generic intval = {dst : @a::(numeric,integral)#, s : byte[:], \
 
 	num = s
 	for i = 0; i < min; i++
-		(c, s) = std.strstep(s)
+		(c, s) = std.charstep(s)
 		if !std.isdigit(c)
 			err# = `std.Some `Shortint
 			-> s
diff --git a/lib/http/parse.myr b/lib/http/parse.myr
index fabf7a23..a91da3af 100644
--- a/lib/http/parse.myr
+++ b/lib/http/parse.myr
@@ -258,7 +258,7 @@ const parsenumber = {ln, base
 	s = ln#
 	ok = false
 	while true
-		(c, s) = std.strstep(s)
+		(c, s) = std.charstep(s)
 		dig = std.charval(c, base)
 		if dig >= 0 && dig < base
 			ok = true
diff --git a/lib/http/url.myr b/lib/http/url.myr
index be17d205..038305f5 100644
--- a/lib/http/url.myr
+++ b/lib/http/url.myr
@@ -150,7 +150,7 @@ const parseparams = {url
 	;;
 
 	match std.decode(url#)
-	| '?':	(_, url#) = std.strstep(url#)
+	| '?':	(_, url#) = std.charstep(url#)
 	| _:	-> `std.Err `Egarbled
 	;;
 
diff --git a/lib/std/cmp.myr b/lib/std/cmp.myr
index 15c49948..c022b2ec 100644
--- a/lib/std/cmp.myr
+++ b/lib/std/cmp.myr
@@ -60,8 +60,8 @@ const strcasecmp = {a, b
 	var ca, cb
 
 	while a.len > 0 && b.len > 0
-		(ca, a) = std.strstep(a)
-		(cb, b) = std.strstep(b)
+		(ca, a) = std.charstep(a)
+		(cb, b) = std.charstep(b)
 		ca = toupper(ca)
 		cb = toupper(cb)
 		if ca < cb
diff --git a/lib/std/fmt.myr b/lib/std/fmt.myr
index 1f542d31..48191e9c 100644
--- a/lib/std/fmt.myr
+++ b/lib/std/fmt.myr
@@ -163,11 +163,11 @@ const sbfmtv = {sb, fmt, ap -> size
 	nparams = ap.tc.nelt
 	nfmt = 0
 	while fmt.len != 0
-		(c, fmt) = strstep(fmt)
+		(c, fmt) = charstep(fmt)
 		match c
 		| '{':
 			if decode(fmt) == '{'
-				(c, fmt) = strstep(fmt)
+				(c, fmt) = charstep(fmt)
 				sbputc(sb, '{')
 			else
 				(params, fmt) = getparams(fmt)
diff --git a/lib/std/hashfuncs.myr b/lib/std/hashfuncs.myr
index da47215f..08a83aab 100644
--- a/lib/std/hashfuncs.myr
+++ b/lib/std/hashfuncs.myr
@@ -50,8 +50,8 @@ const strcaseeq = {a, b
 		if a.len == 0 || b.len == 0
 			break
 		;;
-		(ca, a) = std.strstep(a)
-		(cb, b) = std.strstep(b)
+		(ca, a) = std.charstep(a)
+		(cb, b) = std.charstep(b)
 		if std.tolower(ca) != std.tolower(cb)
 			-> false
 		;;
@@ -65,7 +65,7 @@ const strcasehash = {s
 
 	chars = [][:]
 	while s.len != 0
-		(c, s) = std.strstep(s)
+		(c, s) = std.charstep(s)
 		std.slpush(&chars, std.tolower(c))
 	;;
 	h = siphash24(slbytes(chars), Seed)
diff --git a/lib/std/optparse.myr b/lib/std/optparse.myr
index 0569dfc7..6202aa4a 100644
--- a/lib/std/optparse.myr
+++ b/lib/std/optparse.myr
@@ -105,7 +105,7 @@ const optnext = {ctx
 	var c
 	var arg
 
-	(c, ctx.curarg) = strstep(ctx.curarg)
+	(c, ctx.curarg) = charstep(ctx.curarg)
 
 	match optinfo(ctx, c)
 	| `None:
diff --git a/lib/std/striter.myr b/lib/std/striter.myr
index 3c81c98c..0dd1b022 100644
--- a/lib/std/striter.myr
+++ b/lib/std/striter.myr
@@ -33,7 +33,7 @@ impl iterable chariter -> char =
 		if ci.rest.len == 0
 			-> false
 		;;
-		(c#, ci.rest) = strstep(ci.rest)
+		(c#, ci.rest) = charstep(ci.rest)
 		-> true
 	}
 
diff --git a/lib/std/test/utf.myr b/lib/std/test/utf.myr
index 9f73f5ae..8778b05c 100644
--- a/lib/std/test/utf.myr
+++ b/lib/std/test/utf.myr
@@ -28,4 +28,77 @@ const main = {
 		"wrong width of runes")
 	std.assert(std.strcellwidth("𒀸 𒌋𒅗 𒆷 𒂅𒌒 𒍜 𒀭𒉌𒄿 𒈗 𒁁𒉌 𒋬") == 22, \
 		"wrong width of Cuneiform")
+
+	/* graphemestep() */
+	var s = "a史cЯx̀̀̀̀̀yz̉"
+	var sub, rest
+
+	(sub, rest) = std.graphemestep(s)
+	std.assert(std.streq(sub, "a"), "didn't get \"a\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "史"), "didn't get \"史\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "c"), "didn't get \"c\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "Я"), "didn't get \"Я\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "x̀̀̀̀̀"), "didn't get \"x̀̀̀̀̀\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "y"), "didn't get \"y\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "z̉"), "didn't get \"z̉\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(sub.len == 0, "didn't get \"\" as last grapheme")
+
+
+	/* with excessive combiners */
+	s = "c̸̶̡̡̗̣͕̪͖ͯ͑̈̄̿͊ͣ̈́͝ḧ̵̸̛̥͚̭̣͈͖̼͈͓͓̫͍́̓ͪͫ̋͘͡a̢̩̱̠̘̹̤̯͚̦̰̼̯̲̞͆͂̿ͬ̂͋͒̈ͅͅo̷̷̶̥͖̼̮̳̗͚ͦ̉̆̅̃̍ͤ̆͑ͣ̽́̚s̓̍̍̄͏̖̞̟̱́͡͡͝"
+
+	(sub, rest) = std.graphemestep(s)
+	std.assert(std.streq(sub, "c̸̶̡̡̗̣͕̪͖ͯ͑̈̄̿͊ͣ̈́͝"), "didn't get \"c̸̶̡̡̗̣͕̪͖ͯ͑̈̄̿͊ͣ̈́͝\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "ḧ̵̸̛̥͚̭̣͈͖̼͈͓͓̫͍́̓ͪͫ̋͘͡"), "didn't get \"ḧ̵̸̛̥͚̭̣͈͖̼͈͓͓̫͍́̓ͪͫ̋͘͡\" as next grapheme, it was {}", rest)
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "a̢̩̱̠̘̹̤̯͚̦̰̼̯̲̞͆͂̿ͬ̂͋͒̈ͅͅ"), "didn't get \"a̢̩̱̠̘̹̤̯͚̦̰̼̯̲̞͆͂̿ͬ̂͋͒̈ͅͅ\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "o̷̷̶̥͖̼̮̳̗͚ͦ̉̆̅̃̍ͤ̆͑ͣ̽́̚"), "didn't get \"o̷̷̶̥͖̼̮̳̗͚ͦ̉̆̅̃̍ͤ̆͑ͣ̽́̚\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "s̓̍̍̄͏̖̞̟̱́͡͡͝"), "didn't get \"s̓̍̍̄͏̖̞̟̱́͡͡͝\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(sub.len == 0, "didn't get \"\" as last grapheme")
+
+	/* now with invalid UTF-8 */
+	s = [ ('A' : byte), ('b' : byte), (0xFE : byte),
+	      (0xFF : byte), (0x92 : byte), ('c' : byte) ][:]
+
+	(sub, rest) = std.graphemestep(s)
+	std.assert(std.streq(sub, "A"), "didn't get \"A\" as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "b"), "didn't get \"b\" as next grapheme")
+
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, [ (0xFE : byte) ][:]), "didn't get 0xEE, len={} as next grapheme", sub.len)
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, [ (0xFF : byte) ][:]), "didn't get 0xEA as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, [ (0x92 : byte) ][:]), "didn't get 0xEF as next grapheme")
+
+	(sub, rest) = std.graphemestep(rest)
+	std.assert(std.streq(sub, "c"), "didn't get \"c\" as next grapheme")
 }
diff --git a/lib/std/utf.myr b/lib/std/utf.myr
index 439254ca..9e297b33 100644
--- a/lib/std/utf.myr
+++ b/lib/std/utf.myr
@@ -12,7 +12,8 @@ pkg std =
 	const charlen	: (chr : char -> size)
 	const encode	: (buf : byte[:], chr : char -> size)
 	const decode	: (buf : byte[:] -> char)
-	const strstep	: (str : byte[:] -> (char, byte[:]))
+	const charstep	: (str : byte[:] -> (char, byte[:]))
+	const graphemestep : (str : byte[:] -> (byte[:], byte[:]))
 
 	const strcellwidth : (str : byte[:] -> size)
 ;;
@@ -59,11 +60,35 @@ const decode = {buf
 	var c
 	var b
 
-	(c, b) = strstep(buf)
+	(c, b) = charstep(buf)
 	-> c
 }
 
-const strstep = {str
+const graphemestep = {str
+	var len = 0
+	var rest = str
+	var c
+	var cn
+	var width = 0	/* must start at 0: it is read by the `width > 0` test on the first pass */
+	/* Returns (leading cluster of str, remainder); an empty str yields (str[:0], str[0:]). */
+	while rest.len > 0
+		(c, rest) = charstep(rest)
+		cn = cellwidth(c)
+		/* Stop before a char with width once the cluster has width, or before a bad byte once anything is collected. */
+		if (cn > 0 && width > 0) || (c == Badchar && len > 0)
+			-> (str[:len], str[len:])
+		elif c == Badchar	/* nothing collected yet (len == 0): hand back the bad byte alone */
+			-> (str[:1], str[1:])
+		else
+			len += charlen(c)	/* zero-width chars are absorbed; leading ones attach to the next char with width */
+			width += cn
+		;;
+	;;
+	/* Input exhausted: whatever was collected is the final cluster. */
+	-> (str[:len], str[len:])
+}
+
+const charstep = {str
 	var len
 	var mask
 	var chr
@@ -111,7 +136,7 @@ const strcellwidth = {str
 	var n : size = 0
 
 	while s.len > 0
-		(c, s) = strstep(s)
+		(c, s) = charstep(s)
 		if c == Badchar
 			/* Something will probably be printed as U+FFFD */
 			n++
diff --git a/mbld/parse.myr b/mbld/parse.myr
index 0f1d0533..96e47d5b 100644
--- a/mbld/parse.myr
+++ b/mbld/parse.myr
@@ -732,8 +732,8 @@ const skipspace = {p
 const matchc = {p, c
 	var chr, s
 
-	/* safe to use at eof: strstep returns (-1, "") */
-	(chr, s) = std.strstep(p.rest)
+	/* safe to use at eof: charstep returns (-1, "") */
+	(chr, s) = std.charstep(p.rest)
 	if c == chr
 		p.rest = s
 		-> true
@@ -749,7 +749,7 @@ const peekc = {p
 const getc = {p
 	var c, s
 
-	(c, s) = std.strstep(p.rest)
+	(c, s) = std.charstep(p.rest)
 	p.rest = s
 	-> c
 }
-- 
2.15.0
Follow-Ups:
Re: [PATCH 2/2 v2] Implement graphemestep	Ori Bernstein <ori@xxxxxxxxxxxxxx>
References:
Re: [PATCH 2/2] Implement bygrapheme()	Ori Bernstein <ori@xxxxxxxxxxxxxx>
Prev by Date: Re: [PATCH 2/2] Implement bygrapheme()
Next by Date: Re: [PATCH 2/2 v2] Implement graphemestep
Previous by thread: Re: [PATCH 2/2] Implement bygrapheme()
Next by thread: Re: [PATCH 2/2 v2] Implement graphemestep
Index(es):
- Main
- Thread