Update partitioning

This commit is contained in:
Thelonius Kort
2023-01-14 22:01:21 +01:00
parent d7325d64c5
commit 705c7d2b53
2 changed files with 21 additions and 17 deletions

View File

@ -14,7 +14,9 @@ defmodule Outlook.InternalTree.RawInternalBasic do
def set_split_markers([ %InternalNode{type: :text} = textnode | rest ]) do def set_split_markers([ %InternalNode{type: :text} = textnode | rest ]) do
[ %InternalNode{textnode | [ %InternalNode{textnode |
content: textnode.content content: textnode.content
|> String.replace(~r/([[:upper:]\d])\.(\d)?/u, "\\1#{@nonperiodmarker}\\2") |> String.replace(~r/\.\.\.+/u, "")
|> String.replace(~r/([[:upper:]])\./u, "\\1#{@nonperiodmarker}")
|> String.replace(~r/(\d)\.(\d)/u, "\\1#{@nonperiodmarker}\\2")
|> String.replace(~r|([.?!]["'”]?\s*)|u, "\\1#{@splitmarker}") |> String.replace(~r|([.?!]["'”]?\s*)|u, "\\1#{@splitmarker}")
|> String.replace(@nonperiodmarker, ".") |> String.replace(@nonperiodmarker, ".")
} | set_split_markers(rest) ] } | set_split_markers(rest) ]

View File

@ -91,20 +91,20 @@ defmodule Outlook.InternalTreeTest do
] ]
end end
test "partition_text/1 doesn't split numbers and abbreviated names" do test "don't partition at numbers, repeated dots and abbreviated names" do
tree = [ tree = [
%InternalNode{ %Outlook.InternalTree.InternalNode{
name: "p", name: "p",
attributes: %{}, attributes: %{},
type: :element, type: :element,
nid: "0248aec7-c525-483d-a472-40a34488478d", nid: "oaRwUH3A2wMF",
content: [ content: [
%InternalNode{ %Outlook.InternalTree.InternalNode{
name: "", name: "",
attributes: %{}, attributes: %{},
type: :text, type: :text,
nid: "d35ac56f-bf10-47b1-af19-152e6225bb32", nid: "xep6gWMVWF1D",
content: "F. William Engdahl is 3.7 times more likely to write a good article than Mike Adams. But this doesn't mean anything bad about Mike.", content: "This was written by F. William Endahl in 2021. 99.9% of it is not that bad... But!",
eph: %{sibling_with: :inline} eph: %{sibling_with: :inline}
} }
], ],
@ -112,21 +112,23 @@ defmodule Outlook.InternalTreeTest do
} }
] ]
assert InternalTree.partition_text(tree) |> unify_nids_in_tunits() == [ assert InternalTree.partition_text(tree) |> unify_nids_in_tunits() == [
%InternalNode{ %Outlook.InternalTree.InternalNode{
name: "p", name: "p",
attributes: %{}, attributes: %{},
type: :element, type: :element,
nid: "0248aec7-c525-483d-a472-40a34488478d", nid: "oaRwUH3A2wMF",
content: [ content: [
%TranslationUnit{ %Outlook.InternalTree.TranslationUnit{
status: :untranslated, status: :untranslated,
nid: @default_nid, nid: "xxxxxx",
content: "F. William Engdahl is 3.7 times more likely to write a good article than Mike Adams. " content: "This was written by F. William Endahl in 2021. ",
eph: %{}
}, },
%TranslationUnit{ %Outlook.InternalTree.TranslationUnit{
status: :untranslated, status: :untranslated,
nid: @default_nid, nid: "xxxxxx",
content: "But this doesn't mean anything bad about Mike." content: "99.9% of it is not that bad… But!",
eph: %{}
} }
], ],
eph: %{sibling_with: :block} eph: %{sibling_with: :block}
@ -147,7 +149,7 @@ defmodule Outlook.InternalTreeTest do
attributes: %{}, attributes: %{},
type: :text, type: :text,
nid: "xep6gWMVWF1D", nid: "xep6gWMVWF1D",
content: "This Fit for 55 is the first time in the world that a group of countries, the EU, officially imposes an agenda to force an absurd “Zero” CO2 by 2050 and 55% less CO2 by 2030. EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for ", content: "EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for ",
eph: %{sibling_with: :inline} eph: %{sibling_with: :inline}
}, },
%Outlook.InternalTree.InternalNode{ %Outlook.InternalTree.InternalNode{
@ -192,7 +194,7 @@ defmodule Outlook.InternalTreeTest do
%Outlook.InternalTree.TranslationUnit{ %Outlook.InternalTree.TranslationUnit{
status: :untranslated, status: :untranslated,
nid: "xxxxxx", nid: "xxxxxx",
content: "This Fit for 55 is the first time in the world that a group of countries, the EU, officially imposes an agenda to force an absurd “Zero” CO2 by 2050 and 55% less CO2 by 2030. EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for <a href=\"https://www.politico.eu/article/fit-for-55-eu-5-things-to-know/\">clean transport.”</a>", content: "EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for <a href=\"https://www.politico.eu/article/fit-for-55-eu-5-things-to-know/\">clean transport.”</a>",
eph: %{} eph: %{}
}, },
%Outlook.InternalTree.TranslationUnit{ %Outlook.InternalTree.TranslationUnit{