Ignore dots that are not sentence-ending periods (initials, numbers) when splitting text

This commit is contained in:
Thelonius Kort
2023-01-04 15:26:05 +01:00
parent b0f8778c07
commit 9a7dc7cf80
2 changed files with 47 additions and 1 deletions

View File

@ -10,10 +10,14 @@ defmodule Outlook.InternalTree.RawInternalBasic do
alias Outlook.InternalTree.Html
@splitmarker "@@translationunit@@"
@nonperiodmarker "@@nonperiod@@"
# Text-node clause: appends @splitmarker after every sentence terminator in the
# node's content, then recurses into the remaining nodes.
#
# Dots that directly follow an uppercase letter or a digit (e.g. "F." or "3.7")
# are not treated as sentence terminators. They are temporarily swapped for
# @nonperiodmarker so the terminator regex cannot see them, and restored afterwards.
def set_split_markers([%InternalNode{type: :text} = textnode | rest]) do
  marked_content =
    textnode.content
    # 1. Shield dots after an uppercase letter or digit (an optional following digit is kept).
    |> String.replace(~r/([[:upper:]\d])\.(\d)?/u, "\\1#{@nonperiodmarker}\\2")
    # 2. Insert the split marker after ".", "?" or "!" plus an optional closing quote
    #    and any trailing whitespace.
    |> String.replace(~r|([.?!]["'”]?\s*)|u, "\\1#{@splitmarker}")
    # 3. Turn the shielded dots back into ordinary dots.
    |> String.replace(@nonperiodmarker, ".")

  [%InternalNode{textnode | content: marked_content} | set_split_markers(rest)]
end

View File

@ -87,6 +87,48 @@ defmodule Outlook.InternalTreeTest do
%TranslationUnit{status: :untranslated, uuid: @default_uuid,
content: "<a href=\"dingsda.com\"><b>A</b> sentence</a> with many letters and many, many words. "}],
eph: %{sibling_with: :block}}]
test "partition_text/1 doesn't split numbers and abbreviated names" do
  # The fixture text contains an initial ("F."), a decimal number ("3.7") and
  # two sentences; only the boundary after "Adams. " is expected to produce a split.
  text_node = %InternalNode{
    name: "",
    attributes: %{},
    type: :text,
    uuid: "d35ac56f-bf10-47b1-af19-152e6225bb32",
    content: "F. William Engdahl is 3.7 times more likely to write a good article than Mike Adams. But this doesn't mean anything bad about Mike.",
    eph: %{sibling_with: :inline}
  }

  paragraph = %InternalNode{
    name: "p",
    attributes: %{},
    type: :element,
    uuid: "0248aec7-c525-483d-a472-40a34488478d",
    content: [text_node],
    eph: %{sibling_with: :block}
  }

  expected_units = [
    %TranslationUnit{
      status: :untranslated,
      uuid: @default_uuid,
      content: "F. William Engdahl is 3.7 times more likely to write a good article than Mike Adams. "
    },
    %TranslationUnit{
      status: :untranslated,
      uuid: @default_uuid,
      content: "But this doesn't mean anything bad about Mike."
    }
  ]

  partitioned =
    [paragraph]
    |> InternalTree.partition_text()
    |> unify_uuids_in_tunits()

  # The paragraph itself is unchanged except that its text node is replaced by translation units.
  assert partitioned == [%InternalNode{paragraph | content: expected_units}]
end
end
end