Update partitioning
This commit is contained in:
@ -14,7 +14,9 @@ defmodule Outlook.InternalTree.RawInternalBasic do
|
||||
def set_split_markers([ %InternalNode{type: :text} = textnode | rest ]) do
|
||||
[ %InternalNode{textnode |
|
||||
content: textnode.content
|
||||
|> String.replace(~r/([[:upper:]\d])\.(\d)?/u, "\\1#{@nonperiodmarker}\\2")
|
||||
|> String.replace(~r/\.\.\.+/u, "…")
|
||||
|> String.replace(~r/([[:upper:]])\./u, "\\1#{@nonperiodmarker}")
|
||||
|> String.replace(~r/(\d)\.(\d)/u, "\\1#{@nonperiodmarker}\\2")
|
||||
|> String.replace(~r|([.?!]["'”]?\s*)|u, "\\1#{@splitmarker}")
|
||||
|> String.replace(@nonperiodmarker, ".")
|
||||
} | set_split_markers(rest) ]
|
||||
|
||||
@ -91,20 +91,20 @@ defmodule Outlook.InternalTreeTest do
|
||||
]
|
||||
end
|
||||
|
||||
test "partition_text/1 doesn't split numbers and abbreviated names" do
|
||||
test "don't partition at numbers, repeated dots and abbreviated names" do
|
||||
tree = [
|
||||
%InternalNode{
|
||||
%Outlook.InternalTree.InternalNode{
|
||||
name: "p",
|
||||
attributes: %{},
|
||||
type: :element,
|
||||
nid: "0248aec7-c525-483d-a472-40a34488478d",
|
||||
nid: "oaRwUH3A2wMF",
|
||||
content: [
|
||||
%InternalNode{
|
||||
%Outlook.InternalTree.InternalNode{
|
||||
name: "",
|
||||
attributes: %{},
|
||||
type: :text,
|
||||
nid: "d35ac56f-bf10-47b1-af19-152e6225bb32",
|
||||
content: "F. William Engdahl is 3.7 times more likely to write a good article than Mike Adams. But this doesn't mean anything bad about Mike.",
|
||||
nid: "xep6gWMVWF1D",
|
||||
content: "This was written by F. William Endahl in 2021. 99.9% of it is not that bad... But!",
|
||||
eph: %{sibling_with: :inline}
|
||||
}
|
||||
],
|
||||
@ -112,21 +112,23 @@ defmodule Outlook.InternalTreeTest do
|
||||
}
|
||||
]
|
||||
assert InternalTree.partition_text(tree) |> unify_nids_in_tunits() == [
|
||||
%InternalNode{
|
||||
%Outlook.InternalTree.InternalNode{
|
||||
name: "p",
|
||||
attributes: %{},
|
||||
type: :element,
|
||||
nid: "0248aec7-c525-483d-a472-40a34488478d",
|
||||
nid: "oaRwUH3A2wMF",
|
||||
content: [
|
||||
%TranslationUnit{
|
||||
%Outlook.InternalTree.TranslationUnit{
|
||||
status: :untranslated,
|
||||
nid: @default_nid,
|
||||
content: "F. William Engdahl is 3.7 times more likely to write a good article than Mike Adams. "
|
||||
nid: "xxxxxx",
|
||||
content: "This was written by F. William Endahl in 2021. ",
|
||||
eph: %{}
|
||||
},
|
||||
%TranslationUnit{
|
||||
%Outlook.InternalTree.TranslationUnit{
|
||||
status: :untranslated,
|
||||
nid: @default_nid,
|
||||
content: "But this doesn't mean anything bad about Mike."
|
||||
nid: "xxxxxx",
|
||||
content: "99.9% of it is not that bad… But!",
|
||||
eph: %{}
|
||||
}
|
||||
],
|
||||
eph: %{sibling_with: :block}
|
||||
@ -147,7 +149,7 @@ defmodule Outlook.InternalTreeTest do
|
||||
attributes: %{},
|
||||
type: :text,
|
||||
nid: "xep6gWMVWF1D",
|
||||
content: "This Fit for 55 is the first time in the world that a group of countries, the EU, officially imposes an agenda to force an absurd “Zero” CO2 by 2050 and 55% less CO2 by 2030. EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for ",
|
||||
content: "EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for ",
|
||||
eph: %{sibling_with: :inline}
|
||||
},
|
||||
%Outlook.InternalTree.InternalNode{
|
||||
@ -192,7 +194,7 @@ defmodule Outlook.InternalTreeTest do
|
||||
%Outlook.InternalTree.TranslationUnit{
|
||||
status: :untranslated,
|
||||
nid: "xxxxxx",
|
||||
content: "This Fit for 55 is the first time in the world that a group of countries, the EU, officially imposes an agenda to force an absurd “Zero” CO2 by 2050 and 55% less CO2 by 2030. EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for <a href=\"https://www.politico.eu/article/fit-for-55-eu-5-things-to-know/\">clean transport.”</a>",
|
||||
content: "EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for <a href=\"https://www.politico.eu/article/fit-for-55-eu-5-things-to-know/\">clean transport.”</a>",
|
||||
eph: %{}
|
||||
},
|
||||
%Outlook.InternalTree.TranslationUnit{
|
||||
|
||||
Reference in New Issue
Block a user