7d0bc37efd
Medium uses UTF-16 character offsets (likely to make it easier to parse in JavaScript) but Crystal uses UTF-8. Converting strings to UTF-16 to do the offset calculation and then back to UTF-8 fixes some markup bugs. --- Medium calculates markup offsets using UTF-16 encoding. Some characters, like emoji, are counted as multiple UTF-16 code units, which affects those offsets. For example, in UTF-16 💸 counts as two code units, but Crystal strings count it as only one character. This is a problem for markup generation because it can shift the markup and even cause out-of-range errors. Take the following example: 💸💸! Imagine that `!` is bold but the emoji aren't. For Crystal, this starts at char index 2 and ends at char index 3. Medium's markup will say the markup goes from character 4 to 5. In a 3-character string like this, trying to access character range 4...5 is an error because 5 is already out of bounds. My theory is that this is meant to be compatible with JavaScript's string length calculations, as Medium is primarily a platform built for the web: ```js "a".length // 1 "💸".length // 2 "👩❤️💋👩".length // 11 ``` To get these same numbers in Crystal, strings must be converted to UTF-16: ```crystal "a".to_utf16.size # 1 "💸".to_utf16.size # 2 "👩❤️💋👩".to_utf16.size # 11 ``` The MarkupConverter now converts text into UTF-16 byte arrays on initialization. Once it has figured out the range of bytes needed for each piece of markup, it converts that range back into a UTF-8 string.
211 lines
5.2 KiB
Crystal
211 lines
5.2 KiB
Crystal
require "../spec_helper"
|
|
|
|
include Nodes
|
|
|
|
describe MarkupConverter do
|
|
describe "#convert" do
|
|
# Baseline case: an empty markup list should yield the input string
# back as a single Text node.
it "returns just text with no markups" do
  no_markups = [] of PostResponse::Markup
  expected = [Text.new(content: "Hello, world")]

  nodes = MarkupConverter.convert(text: "Hello, world", markups: no_markups)

  nodes.should eq(expected)
end
|
|
|
|
# Two non-overlapping markups: STRONG over offsets 0-6 ("strong") and
# EM over offsets 11-21 ("emphasized"). The text between and after them
# is expected to come back as plain Text nodes.
it "returns text with multiple markups" do
  strong_then_em = Array(PostResponse::Markup).from_json(<<-JSON)
    [
      {
        "title": null,
        "type": "STRONG",
        "href": null,
        "start": 0,
        "end": 6,
        "rel": null,
        "anchorType": null
      },
      {
        "title": null,
        "type": "EM",
        "href": null,
        "start": 11,
        "end": 21,
        "rel": null,
        "anchorType": null
      }
    ]
    JSON

  nodes = MarkupConverter.convert(text: "strong and emphasized only", markups: strong_then_em)

  expected = [
    Strong.new(children: [Text.new(content: "strong")] of Child),
    Text.new(content: " and "),
    Emphasis.new(children: [Text.new(content: "emphasized")] of Child),
    Text.new(content: " only"),
  ]
  nodes.should eq(expected)
end
|
|
|
|
# A single CODE markup over offsets 7-11 ("code"); the leading
# "inline " is expected to stay a plain Text node.
it "returns text with a code markup" do
  code_markup = Array(PostResponse::Markup).from_json(<<-JSON)
    [
      {
        "title": null,
        "type": "CODE",
        "href": null,
        "start": 7,
        "end": 11,
        "rel": null,
        "anchorType": null
      }
    ]
    JSON

  nodes = MarkupConverter.convert(text: "inline code", markups: code_markup)

  expected = [
    Text.new(content: "inline "),
    Code.new(children: [Text.new(content: "code")] of Child),
  ]
  nodes.should eq(expected)
end
|
|
|
|
# An "A" markup whose anchorType is "LINK" is expected to become an
# Anchor node carrying the markup's href.
it "renders an A-LINK markup" do
  link_markup = Array(PostResponse::Markup).from_json(<<-JSON)
    [
      {
        "title": "",
        "type": "A",
        "href": "https://example.com",
        "start": 7,
        "end": 11,
        "rel": "",
        "anchorType": "LINK"
      }
    ]
    JSON

  nodes = MarkupConverter.convert(text: "I am a Link", markups: link_markup)

  expected = [
    Text.new(content: "I am a "),
    Anchor.new(children: [Text.new(content: "Link")] of Child, href: "https://example.com"),
  ]
  nodes.should eq(expected)
end
|
|
|
|
# An "A" markup whose anchorType is "USER" is expected to become a
# UserAnchor node carrying the markup's userId instead of an href.
it "renders an A-USER markup" do
  user_markup = Array(PostResponse::Markup).from_json(<<-JSON)
    [
      {
        "title": null,
        "type": "A",
        "href": null,
        "userId": "abc123",
        "start": 3,
        "end": 10,
        "rel": null,
        "anchorType": "USER"
      }
    ]
    JSON

  nodes = MarkupConverter.convert(text: "Hi Dr Nick!", markups: user_markup)

  expected = [
    Text.new(content: "Hi "),
    UserAnchor.new(children: [Text.new(content: "Dr Nick")] of Child, user_id: "abc123"),
    Text.new(content: "!"),
  ]
  nodes.should eq(expected)
end
|
|
|
|
# EM spans offsets 0-10 and STRONG spans 7-15, so they overlap on
# offsets 7-10 ("and"). The expectation is three segments: EM only,
# EM wrapping STRONG for the shared range, then STRONG only.
it "renders overlapping markups" do
  overlapping = Array(PostResponse::Markup).from_json(<<-JSON)
    [
      {
        "title": null,
        "type": "STRONG",
        "href": null,
        "userId": null,
        "start": 7,
        "end": 15,
        "rel": null,
        "anchorType": null
      },
      {
        "title": null,
        "type": "EM",
        "href": null,
        "userId": null,
        "start": 0,
        "end": 10,
        "rel": null,
        "anchorType": null
      }
    ]
    JSON

  nodes = MarkupConverter.convert(text: "Italic and bold", markups: overlapping)

  italic_only = Emphasis.new(children: [Text.new(content: "Italic ")] of Child)
  italic_and_bold = Emphasis.new(children: [
    Strong.new(children: [Text.new(content: "and")] of Child),
  ] of Child)
  bold_only = Strong.new(children: [Text.new(content: " bold")] of Child)

  nodes.should eq([
    italic_only,
    italic_and_bold,
    bold_only,
  ])
end
|
|
|
|
# The markup offsets (5-6) only line up with "<" if each emoji is
# counted as two units, which is how the incoming offsets are expressed.
it "handles offsets from unicode text" do
  bold_marker = PostResponse::Markup.new(
    type: PostResponse::MarkupType::STRONG,
    start: 5,
    end: 6
  )

  nodes = MarkupConverter.convert(text: "💸💸 <", markups: [bold_marker])

  expected = [
    Text.new(content: "💸💸 "),
    Strong.new(children: [Text.new(content: "<")] of Child),
  ]
  nodes.should eq(expected)
end
|
|
end
|
|
|
|
describe "#wrap_in_markups" do
|
|
# Wrapping one text fragment in two markups: the expected result nests
# the first markup (STRONG) inside the second (the USER anchor).
it "returns text wrapped in multiple markups" do
  strong_and_user = Array(PostResponse::Markup).from_json(<<-JSON)
    [
      {
        "title": null,
        "type": "STRONG",
        "href": null,
        "start": 0,
        "end": 17,
        "rel": null,
        "anchorType": null
      },
      {
        "title": null,
        "type": "A",
        "href": null,
        "userId": "abc123",
        "start": 13,
        "end": 17,
        "rel": null,
        "anchorType": "USER"
      }
    ]
    JSON
  subject = MarkupConverter.new(text: "it's ya boi, jack", markups: strong_and_user)

  wrapped = subject.wrap_in_markups("jack", strong_and_user)

  bold_jack = Strong.new(children: [Text.new(content: "jack")] of Child)
  expected = [
    UserAnchor.new(children: [bold_jack] of Child, user_id: "abc123"),
  ]
  wrapped.should eq(expected)
end
|
|
end
|
|
end
|