clean(s)
-- Reference implementation: passes `s` (with its byte length) to GLib's
-- g_utf8_make_valid and returns the result as a Lua string.
-- The buffer returned by GLib is copied with ffi.string and then released
-- with g_free before the copy is handed back.
glib_make_valid = (s) ->
  buf = C.g_utf8_make_valid s, #s
  made_valid = ffi.string buf
  C.g_free buf
  made_valid
-- Convenience wrapper around `clean`: takes the two values it returns
-- (presumably a buffer pointer and its byte length -- confirm against
-- `clean`'s definition) and converts them into a Lua string.
utf8_clean = (s) ->
  ptr, len = clean s
  ffi.string(ptr, len)
-- Asserts that cleaning `s` yields `expected`.
-- The raw strings are compared first; only on a mismatch are both sides run
-- through to_hex_string and handed to assert.equal, so the failure report
-- compares the to_hex_string renderings instead of raw, possibly invalid,
-- bytes.
assert_clean = (s, expected) ->
  actual = utf8_clean s
  if actual != expected
    assert.equal to_hex_string(expected), to_hex_string(actual)
-- Already valid input must come back unchanged: plain ASCII, and a string
-- mixing two-, three- and four-byte UTF-8 characters.
returns a clean string as is
assert_clean '123456789', '123456789'
assert_clean "åäöƏ⏱🌨", "åäöƏ⏱🌨"
-- \xc3 opens a two-byte sequence, but here it is followed by an ASCII byte
-- ('$' = \x24, 'a' = \x61) instead of a continuation byte. Each such lead
-- byte is expected to become one '�' while the ASCII byte is kept.
cleans up incorrect dual sequences
assert_clean '|\xc3\x24|', '|�$|'
assert_clean '|\xc3\x24\xc3\x61|', '|�$�a|'
assert_clean '|\xc3\x24X\xc3\x61|', '|�$X�a|'
-- \xe1 opens a three-byte sequence that is cut short by '$' (\x24).
-- The expected output has one '�' per byte of the truncated sequence
-- (the lead byte, plus the \x80 continuation byte when present).
cleans up incorrect three-byte sequences
assert_clean '|\xe1\x24|', '|�$|'
assert_clean '|\xe1\x80\x24|', '|��$|'
-- \xf0 opens a four-byte sequence that is cut short by '$' (\x24) after
-- zero, one or two \x80 bytes. The expected output has one '�' for every
-- byte preceding the '$'.
cleans up incorrect four-byte sequences
assert_clean '|\xf0\x24|', '|�$|'
assert_clean '|\xf0\x80\x24|', '|��$|'
assert_clean '|\xf0\x80\x80\x24|', '|���$|'
-- Continuation bytes (\x80, \xa9) that do not belong to any sequence are each
-- expected to become one '�', whether they sit in the middle, at the start or
-- at the end of the string. The last case has a valid '©' (\xc2\xa9) followed
-- by one surplus continuation byte; only the surplus byte is replaced.
cleans up stray continuation bytes
assert_clean '|\x80|', '|�|'
assert_clean '|\x80\x80|', '|��|'
assert_clean '\x80|', '�|'
assert_clean '|\x80', '|�'
assert_clean '\xc2\xa9\xa9', '©�'
-- Byte values 192-193 (0xC0-0xC1) and 245-255 (0xF5-0xFF) never occur in
-- valid UTF-8. Each one, placed alone between two '|', is expected to become
-- a single '�'. Both loop ranges are inclusive.
cleans up illegal bytes
for b = 192, 193
assert_clean "|#{string.char(b)}|", '|�|'
for b = 245, 255
assert_clean "|#{string.char(b)}|", '|�|'
-- Input is a stray continuation byte (\x8d), a two-byte lead (\xc7) and a
-- three-byte lead (\xe0) as the very last byte, with nothing following it.
-- All three bytes are expected to become '�', including the unterminated
-- sequence at the end of the string.
cleans up broken utf8 at the end
assert_clean '\x8d\xc7\xe0', '���'
-- A two-byte lead (\xc7) is immediately followed by a three-byte lead (\xe0),
-- which is in turn followed by ASCII ('`' = \x60, '(' = \x28) and then a stray
-- continuation byte (\x8c). Both leads and the stray byte are each expected
-- to become '�'; the ASCII bytes are kept.
handles sequence starts within sequences
assert_clean '\xc7\xe0\x60\x28\x8c', '��`(�'
-- A two-byte lead (\xc4) is followed by \xf7, a byte value that is never
-- valid in UTF-8, then by 'a' (\x61) and a stray continuation byte (\xb9).
-- Every non-ASCII byte is expected to become '�'; the 'a' is kept.
handles illegal values in sequences
assert_clean '\xc4\xf7\x61\xb9', '��a�'
-- Disabled micro-benchmark (flip the condition to run it). Compares GLib's
-- g_utf8_validate / g_utf8_make_valid against this project's own `is_valid`
-- and `clean`, first on valid input and then on input containing broken
-- sequences. Every measurement is preceded by 100 untimed warm-up calls and
-- then times 1000 calls.
if false
  -- Runs `f` once and prints the time it took, labelled with `title`.
  -- The difference of the two get_monotonic_time readings is divided by
  -- 1000000 (presumably microseconds to seconds -- confirm against
  -- get_monotonic_time's definition).
  time = (title, f) ->
    t0 = get_monotonic_time!
    f!
    t1 = get_monotonic_time!
    elapsed = (t1 - t0) / 1000000
    print "'#{title}': #{elapsed} elapsed"

  -- Valid, pure-ASCII input.
  valid = string.rep 'abcdefghijklmnopqrstuvxyzABCDEFGHIJKLMNOPQRSTUVXYZ', 1000

  for _ = 1, 100 do C.g_utf8_validate valid, #valid, nil
  time 'g_utf8_validate', ->
    for _ = 1, 1000
      C.g_utf8_validate valid, #valid, nil

  for _ = 1, 100 do is_valid valid
  time 'own is_valid', ->
    for _ = 1, 1000
      is_valid valid

  -- The timed GLib loops call g_utf8_make_valid / g_free directly, without
  -- the ffi.string copy that glib_make_valid performs.
  for _ = 1, 100 do glib_make_valid(valid)
  time 'g_utf8_make_valid CLEAN', ->
    for _ = 1, 1000
      out = C.g_utf8_make_valid(valid, #valid)
      C.g_free(out)

  for _ = 1, 100 do clean valid
  time 'own clean CLEAN', ->
    for _ = 1, 1000
      clean valid

  -- Input mixing valid multi-byte characters with truncated two-, three- and
  -- four-byte sequences.
  broken = string.rep 'ab⏱🌨\xc3\x24hiåäömn\xe1\x80\x24opq\xf0\x24', 1000

  for _ = 1, 100 do glib_make_valid(broken)
  time 'g_utf8_make_valid BROKEN', ->
    for _ = 1, 1000
      out = C.g_utf8_make_valid(broken, #broken)
      C.g_free(out)

  for _ = 1, 100 do clean broken
  time 'own clean BROKEN', ->
    for _ = 1, 1000
      clean broken