From 7d0bc37efdcf41b54ece89d2fc8d1fc21d67a67f Mon Sep 17 00:00:00 2001
From: Edward Loveall <edward@edwardloveall.com>
Date: Sun, 30 Jan 2022 11:47:08 -0500
Subject: [PATCH] Fix markup errors caused by UTF-16/8 differences
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Medium uses UTF-16 character offsets (likely to make it easier to parse
in JavaScript) but Crystal uses UTF-8. Converting strings to UTF-16 to
do offset calculation then back to UTF-8 fixes some markup bugs.

---

Medium calculates markup offsets using UTF-16 encoding. Some characters,
like emoji, are counted as multiple UTF-16 code units, which affects those
offsets. For example, in UTF-16 💸 takes two code units, but Crystal
strings only count it as one character. This is a problem for markup
generation because it can offset the markup and even cause out-of-range
errors.

Take the following example:

💸💸!

Imagine that `!` is bold but the emoji aren't. For Crystal, this starts
at char index 2 and ends at char index 3. Medium's markup will say markup
goes from character 4 to 5. In a 3 character string like this, trying
to access character range 4...5 is an error because 5 is already out of
bounds.

My theory is that this is meant to be compatible with JavaScript's
string length calculations, as Medium is primarily a platform built for
the web:

```js
"a".length // 1
"💸".length // 2
"👩‍❤️‍💋‍👩".length // 11
```

To get these same numbers in Crystal, strings must be converted to
UTF-16:

```crystal
"a".to_utf16.size # 1
"💸".to_utf16.size # 2
"👩‍❤️‍💋‍👩".to_utf16.size # 11
```

The MarkupConverter now converts text into an array of UTF-16 code units
on initialization. Once it's figured out the range of code units needed
for each piece of markup, it converts that range back into a UTF-8 string.
---
 CHANGELOG                             |  4 ++++
 spec/classes/markup_converter_spec.cr | 15 +++++++++++++++
 src/classes/markup_converter.cr       | 11 +++++++----
 src/models/post_response.cr           | 11 +++++++++++
 src/version.cr                        |  2 +-
 5 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index ba2d066..ea91b24 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+2022-01-30
+
+* Fix bug in markup generation for text with multiple codepoints
+
 2022-01-29
 
 * Provide list of instances as JSON file
diff --git a/spec/classes/markup_converter_spec.cr b/spec/classes/markup_converter_spec.cr
index 79b956e..aca544c 100644
--- a/spec/classes/markup_converter_spec.cr
+++ b/spec/classes/markup_converter_spec.cr
@@ -153,6 +153,21 @@ describe MarkupConverter do
         Strong.new(children: [Text.new(" bold")] of Child),
       ])
     end
+
+    it "handles offsets from unicode text" do
+      markup = PostResponse::Markup.new(
+        type: PostResponse::MarkupType::STRONG,
+        start: 5,
+        end: 6
+      )
+
+      result = MarkupConverter.convert(text: "💸💸 <", markups: [markup])
+
+      result.should eq([
+        Text.new("💸💸 "),
+        Strong.new(children: [Text.new("<")] of Child),
+      ])
+    end
   end
 
   describe "#wrap_in_markups" do
diff --git a/src/classes/markup_converter.cr b/src/classes/markup_converter.cr
index dbbe6a4..20b9928 100644
--- a/src/classes/markup_converter.cr
+++ b/src/classes/markup_converter.cr
@@ -10,19 +10,19 @@ class MarkupConverter
   include Nodes
 
   getter markups : Array(PostResponse::Markup)
-  getter text : String
+  getter text : Slice(UInt16)
 
   def self.convert(text : String?, markups : Array(PostResponse::Markup))
     new(text, markups).convert
   end
 
   def initialize(text : String?, @markups : Array(PostResponse::Markup))
-    @text = text || ""
+    @text = (text || "").to_utf16
   end
 
   def convert : Array(Child)
     ranges.flat_map do |range_with_markups|
-      text_to_wrap = text[range_with_markups.range]
+      text_to_wrap = String.from_utf16(text[range_with_markups.range])
       wrap_in_markups(text_to_wrap, range_with_markups.markups)
     end
   end
@@ -39,7 +39,10 @@ class MarkupConverter
     end.to_a
   end
 
-  def wrap_in_markups(child : String | Child, markups : Array(PostResponse::Markup)) : Array(Child)
+  def wrap_in_markups(
+    child : String | Child,
+    markups : Array(PostResponse::Markup)
+  ) : Array(Child)
     if child.is_a?(String)
       child = Text.new(child)
     end
diff --git a/src/models/post_response.cr b/src/models/post_response.cr
index aedf898..478884b 100644
--- a/src/models/post_response.cr
+++ b/src/models/post_response.cr
@@ -74,6 +74,17 @@ class PostResponse
     property start : Int32
     property end : Int32
     property anchorType : AnchorType?
+
+    def initialize(
+      @type : MarkupType,
+      @start : Int32,
+      @end : Int32,
+      @title : String? = nil,
+      @href : String? = nil,
+      @userId : String? = nil,
+      @anchorType : AnchorType? = nil
+    )
+    end
   end
 
   enum MarkupType
diff --git a/src/version.cr b/src/version.cr
index bcebee0..526dd69 100644
--- a/src/version.cr
+++ b/src/version.cr
@@ -1,3 +1,3 @@
 module Scribe
-  VERSION = "2022-01-29"
+  VERSION = "2022-01-30"
 end