Fix markup errors caused by UTF-16/8 differences
Medium uses UTF-16 character offsets (likely to make it easier to parse in JavaScript) but Crystal uses UTF-8. Converting strings to UTF-16 to do the offset calculation and then back to UTF-8 fixes some markup bugs. --- Medium calculates markup offsets using UTF-16 encoding. Some characters, like emoji, are counted as multiple UTF-16 code units, which affects those offsets. For example, in UTF-16 💸 takes two code units, but Crystal strings count it as only one character. This is a problem for markup generation because it can shift the markup and even cause out-of-range errors. Take the following example: 💸💸! Imagine that `!` is bold but the emoji aren't. For Crystal, the bold markup starts at char index 2 and ends at char index 3. Medium's markup will say the markup goes from character 4 to 5. In a 3-character string like this, trying to access character range 4...5 is an error because 5 is already out of bounds. My theory is that this is meant to be compatible with JavaScript's string length calculations, as Medium is primarily a platform built for the web: ```js "a".length // 1 "💸".length // 2 "👩❤️💋👩".length // 11 ``` To get these same numbers in Crystal, strings must be converted to UTF-16: ```crystal "a".to_utf16.size # 1 "💸".to_utf16.size # 2 "👩❤️💋👩".to_utf16.size # 11 ``` The MarkupConverter now converts text into UTF-16 code-unit arrays on initialization. Once it has figured out the range of code units needed for each piece of markup, it converts that range back into a UTF-8 string.
This commit is contained in:
parent
648a933b24
commit
7d0bc37efd
5 changed files with 38 additions and 5 deletions
|
@ -1,3 +1,7 @@
|
||||||
|
2022-01-30
|
||||||
|
|
||||||
|
* Fix bug in markup generation for text with multiple codepoints
|
||||||
|
|
||||||
2022-01-29
|
2022-01-29
|
||||||
|
|
||||||
* Provide list of instances as JSON file
|
* Provide list of instances as JSON file
|
||||||
|
|
|
@ -153,6 +153,21 @@ describe MarkupConverter do
|
||||||
Strong.new(children: [Text.new(" bold")] of Child),
|
Strong.new(children: [Text.new(" bold")] of Child),
|
||||||
])
|
])
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "handles offsets from unicode text" do
|
||||||
|
markup = PostResponse::Markup.new(
|
||||||
|
type: PostResponse::MarkupType::STRONG,
|
||||||
|
start: 5,
|
||||||
|
end: 6
|
||||||
|
)
|
||||||
|
|
||||||
|
result = MarkupConverter.convert(text: "💸💸 <", markups: [markup])
|
||||||
|
|
||||||
|
result.should eq([
|
||||||
|
Text.new("💸💸 "),
|
||||||
|
Strong.new(children: [Text.new("<")] of Child),
|
||||||
|
])
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
describe "#wrap_in_markups" do
|
describe "#wrap_in_markups" do
|
||||||
|
|
|
@ -10,19 +10,19 @@ class MarkupConverter
|
||||||
include Nodes
|
include Nodes
|
||||||
|
|
||||||
getter markups : Array(PostResponse::Markup)
|
getter markups : Array(PostResponse::Markup)
|
||||||
getter text : String
|
getter text : Slice(UInt16)
|
||||||
|
|
||||||
def self.convert(text : String?, markups : Array(PostResponse::Markup))
|
def self.convert(text : String?, markups : Array(PostResponse::Markup))
|
||||||
new(text, markups).convert
|
new(text, markups).convert
|
||||||
end
|
end
|
||||||
|
|
||||||
def initialize(text : String?, @markups : Array(PostResponse::Markup))
|
def initialize(text : String?, @markups : Array(PostResponse::Markup))
|
||||||
@text = text || ""
|
@text = (text || "").to_utf16
|
||||||
end
|
end
|
||||||
|
|
||||||
def convert : Array(Child)
|
def convert : Array(Child)
|
||||||
ranges.flat_map do |range_with_markups|
|
ranges.flat_map do |range_with_markups|
|
||||||
text_to_wrap = text[range_with_markups.range]
|
text_to_wrap = String.from_utf16(text[range_with_markups.range])
|
||||||
wrap_in_markups(text_to_wrap, range_with_markups.markups)
|
wrap_in_markups(text_to_wrap, range_with_markups.markups)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -39,7 +39,10 @@ class MarkupConverter
|
||||||
end.to_a
|
end.to_a
|
||||||
end
|
end
|
||||||
|
|
||||||
def wrap_in_markups(child : String | Child, markups : Array(PostResponse::Markup)) : Array(Child)
|
def wrap_in_markups(
|
||||||
|
child : String | Child,
|
||||||
|
markups : Array(PostResponse::Markup)
|
||||||
|
) : Array(Child)
|
||||||
if child.is_a?(String)
|
if child.is_a?(String)
|
||||||
child = Text.new(child)
|
child = Text.new(child)
|
||||||
end
|
end
|
||||||
|
|
|
@ -74,6 +74,17 @@ class PostResponse
|
||||||
property start : Int32
|
property start : Int32
|
||||||
property end : Int32
|
property end : Int32
|
||||||
property anchorType : AnchorType?
|
property anchorType : AnchorType?
|
||||||
|
|
||||||
|
def initialize(
|
||||||
|
@type : MarkupType,
|
||||||
|
@start : Int32,
|
||||||
|
@end : Int32,
|
||||||
|
@title : String? = nil,
|
||||||
|
@href : String? = nil,
|
||||||
|
@userId : String? = nil,
|
||||||
|
@anchorType : AnchorType? = nil
|
||||||
|
)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
enum MarkupType
|
enum MarkupType
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
module Scribe
|
module Scribe
|
||||||
VERSION = "2022-01-29"
|
VERSION = "2022-01-30"
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in a new issue