From 7d0bc37efdcf41b54ece89d2fc8d1fc21d67a67f Mon Sep 17 00:00:00 2001 From: Edward Loveall Date: Sun, 30 Jan 2022 11:47:08 -0500 Subject: [PATCH] Fix markup errors caused by UTF-16/8 differences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Medium uses UTF-16 character offsets (likely to make it easier to parse in JavaScript) but Crystal uses UTF-8. Converting strings to UTF-16 to do offset calculation then back to UTF-8 fixes some markup bugs. --- Medium calculates markup offsets using UTF-16 encoding. Some characters, like emoji, count as multiple code units, which affects those offsets. For example, in UTF-16 💸 is worth two code units, but Crystal strings only count it as one character. This is a problem for markup generation because it can offset the markup and even cause out-of-range errors. Take the following example: 💸💸! Imagine that `!` was bold but the emoji isn't. For Crystal, this starts at char index 2 and ends at char index 3. Medium's markup will say markup goes from character 4 to 5. In a 3 character string like this, trying to access character range 4...5 is an error because those indices are already out of bounds. My theory is that this is meant to be compatible with JavaScript's string length calculations, as Medium is primarily a platform built for the web: ```js "a".length // 1 "💸".length // 2 "👩‍❤️‍💋‍👩".length // 11 ``` To get these same numbers in Crystal, strings must be converted to UTF-16: ```crystal "a".to_utf16.size # 1 "💸".to_utf16.size # 2 "👩‍❤️‍💋‍👩".to_utf16.size # 11 ``` The MarkupConverter now converts text into arrays of UTF-16 code units on initialization. Once it's figured out the range of code units needed for each piece of markup, it converts it back into UTF-8 strings. 
--- CHANGELOG | 4 ++++ spec/classes/markup_converter_spec.cr | 15 +++++++++++++++ src/classes/markup_converter.cr | 11 +++++++---- src/models/post_response.cr | 11 +++++++++++ src/version.cr | 2 +- 5 files changed, 38 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index ba2d066..ea91b24 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,7 @@ +2022-01-30 + +* Fix bug in markup generation for text with multiple codepoints + 2022-01-29 * Provide list of instances as JSON file diff --git a/spec/classes/markup_converter_spec.cr b/spec/classes/markup_converter_spec.cr index 79b956e..aca544c 100644 --- a/spec/classes/markup_converter_spec.cr +++ b/spec/classes/markup_converter_spec.cr @@ -153,6 +153,21 @@ describe MarkupConverter do Strong.new(children: [Text.new(" bold")] of Child), ]) end + + it "handles offsets from unicode text" do + markup = PostResponse::Markup.new( + type: PostResponse::MarkupType::STRONG, + start: 5, + end: 6 + ) + + result = MarkupConverter.convert(text: "💸💸 <", markups: [markup]) + + result.should eq([ + Text.new("💸💸 "), + Strong.new(children: [Text.new("<")] of Child), + ]) + end end describe "#wrap_in_markups" do diff --git a/src/classes/markup_converter.cr b/src/classes/markup_converter.cr index dbbe6a4..20b9928 100644 --- a/src/classes/markup_converter.cr +++ b/src/classes/markup_converter.cr @@ -10,19 +10,19 @@ class MarkupConverter include Nodes getter markups : Array(PostResponse::Markup) - getter text : String + getter text : Slice(UInt16) def self.convert(text : String?, markups : Array(PostResponse::Markup)) new(text, markups).convert end def initialize(text : String?, @markups : Array(PostResponse::Markup)) - @text = text || "" + @text = (text || "").to_utf16 end def convert : Array(Child) ranges.flat_map do |range_with_markups| - text_to_wrap = text[range_with_markups.range] + text_to_wrap = String.from_utf16(text[range_with_markups.range]) wrap_in_markups(text_to_wrap, range_with_markups.markups) end end 
@@ -39,7 +39,10 @@ class MarkupConverter end.to_a end - def wrap_in_markups(child : String | Child, markups : Array(PostResponse::Markup)) : Array(Child) + def wrap_in_markups( + child : String | Child, + markups : Array(PostResponse::Markup) + ) : Array(Child) if child.is_a?(String) child = Text.new(child) end diff --git a/src/models/post_response.cr b/src/models/post_response.cr index aedf898..478884b 100644 --- a/src/models/post_response.cr +++ b/src/models/post_response.cr @@ -74,6 +74,17 @@ class PostResponse property start : Int32 property end : Int32 property anchorType : AnchorType? + + def initialize( + @type : MarkupType, + @start : Int32, + @end : Int32, + @title : String? = nil, + @href : String? = nil, + @userId : String? = nil, + @anchorType : AnchorType? = nil + ) + end end enum MarkupType diff --git a/src/version.cr b/src/version.cr index bcebee0..526dd69 100644 --- a/src/version.cr +++ b/src/version.cr @@ -1,3 +1,3 @@ module Scribe - VERSION = "2022-01-29" + VERSION = "2022-01-30" end