From 9a7dc7cf80d832ad24a4678a5a042f71c9ac96d6 Mon Sep 17 00:00:00 2001 From: Thelonius Kort Date: Wed, 4 Jan 2023 15:26:05 +0100 Subject: [PATCH] Add ignoring non-period points/dots --- .../internal_tree/raw_internal_basic.ex | 6 ++- test/outlook/internaltree_test.exs | 42 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/lib/outlook/internal_tree/raw_internal_basic.ex b/lib/outlook/internal_tree/raw_internal_basic.ex index cdaaf84..63f347c 100644 --- a/lib/outlook/internal_tree/raw_internal_basic.ex +++ b/lib/outlook/internal_tree/raw_internal_basic.ex @@ -10,10 +10,14 @@ defmodule Outlook.InternalTree.RawInternalBasic do alias Outlook.InternalTree.Html @splitmarker "@@translationunit@@" + @nonperiodmarker "@@nonperiod@@" def set_split_markers([ %InternalNode{type: :text} = textnode | rest ]) do [ %InternalNode{textnode | - content: String.replace(textnode.content, ~r|([.?!]["'”]?\s*)|u, "\\1#{@splitmarker}") + content: textnode.content + |> String.replace(~r/([[:upper:]\d])\.(\d)?/u, "\\1#{@nonperiodmarker}\\2") + |> String.replace(~r|([.?!]["'”]?\s*)|u, "\\1#{@splitmarker}") + |> String.replace(@nonperiodmarker, ".") } | set_split_markers(rest) ] end diff --git a/test/outlook/internaltree_test.exs b/test/outlook/internaltree_test.exs index c82ae1f..2bd6afc 100644 --- a/test/outlook/internaltree_test.exs +++ b/test/outlook/internaltree_test.exs @@ -87,6 +87,48 @@ defmodule Outlook.InternalTreeTest do %TranslationUnit{status: :untranslated, uuid: @default_uuid, content: "A sentence with many letters and many, many words. "}], eph: %{sibling_with: :block}}] + + test "partition_text/1 doesn't split numbers and abbreviated names" do + tree = [ + %InternalNode{ + name: "p", + attributes: %{}, + type: :element, + uuid: "0248aec7-c525-483d-a472-40a34488478d", + content: [ + %InternalNode{ + name: "", + attributes: %{}, + type: :text, + uuid: "d35ac56f-bf10-47b1-af19-152e6225bb32", + content: "F. William Engdahl is 3.7 times more likely to write a good article than Mike Adams. But this doesn't mean anything bad about Mike.", + eph: %{sibling_with: :inline} + } + ], + eph: %{sibling_with: :block} + } + ] + assert InternalTree.partition_text(tree) |> unify_uuids_in_tunits() == [ + %InternalNode{ + name: "p", + attributes: %{}, + type: :element, + uuid: "0248aec7-c525-483d-a472-40a34488478d", + content: [ + %TranslationUnit{ + status: :untranslated, + uuid: @default_uuid, + content: "F. William Engdahl is 3.7 times more likely to write a good article than Mike Adams. " + }, + %TranslationUnit{ + status: :untranslated, + uuid: @default_uuid, + content: "But this doesn't mean anything bad about Mike." + } + ], + eph: %{sibling_with: :block} + } + ] end end end