napulitano
New Member
Pattern
\[code\]
1 = < font=?'Bold' bbox=F'l ..' s >
    < (~s) >*
2 = < font=!'Bold' bbox=F'l ..' s=RM'\s*(.*' > | (1.l < 2.l)
    < (~s) >*
3 = < font=!'Bold' bbox=F'l ..' s=RM'.*)\s*' > | (1.l < 3.l)
\[/code\]
where
\[code\]
element names are left unspecified
font, bbox and s are attributes
V = string, N = string
? :: V -> bool -- value contains string
! :: V -> bool = not . (?) -- value does not contain string
~ :: N -> bool -- value of attribute N is empty or whitespace
F :: V -> [(N, float)] -- extracts a list of named floats from value
RM :: V -> bool -- value matches regular expression
\[/code\]
Code
\[code\]
open System
open System.Collections.Generic
open System.Xml.Linq

/// `a -?- b` is true when the string `a` contains the substring `b`.
let inline (-?-) a (b : string) = (a : string).Contains b

/// Prefix test: true when the string is null, empty or only whitespace.
let inline (~~) s = s |> String.IsNullOrWhiteSpace

/// Applies whatever implicit conversion (`op_Implicit`) exists from the
/// argument type to the expected type, e.g. string -> XName.
let inline (!>) x = ( ^a : (static member op_Implicit : ^b -> ^a) x )

/// `x @ n` reads attribute `n` of element `x`; yields the empty string when
/// the attribute is absent.
/// Note: this definition shadows the built-in list-append operator `@` for
/// all code that follows it.
let inline (@) (x : XElement) n =
    match x.Attribute(!> n) with
    | null -> String.Empty
    | a -> a.Value

/// `x @< n <| v` sets attribute `n` of element `x` to `v`.
let inline (@<) (x : XElement) n v = x.SetAttributeValue(!> n, v)

type X = XElement
type XE = XElement IEnumerator

/// Parses the space-separated "bbox" attribute of `e` into a list of floats.
/// Empty entries are dropped, so a missing or empty attribute yields `[]`
/// (which `left` maps to `nan`) instead of failing on `float ""`.
let inline bbox e =
    (e @ "bbox").Split([| ' ' |], StringSplitOptions.RemoveEmptyEntries)
    |> Seq.map float
    |> Seq.toList

/// First component of a bbox list, or `nan` when the list is empty.
/// Any comparison against `nan` is false, so such spans never match.
let inline left bbox =
    match bbox with
    | l :: _ -> l
    | _ -> nan

/// `mark n` returns a function that sets attribute "class-<n>" on every
/// element of a sequence. The GUID is created once per application of
/// `mark n`, so all elements marked in one call share the same value.
let mark n =
    let id = Guid.NewGuid()
    Seq.iter (fun e -> e @< "class-" + n <| id)

/// Advances `n` until it rests on a span whose "s" attribute is not blank.
/// Returns false when the enumerator is exhausted first; in that case
/// `n.Current` must not be read.
let skipBlank (n : XE) =
    let mutable more = n.MoveNext()
    while more && ~~(n.Current @ "s") do
        more <- n.MoveNext()
    more

/// Tries to match the speaker pattern starting at `n.Current`:
///   1. a span whose font contains "Bold",
///   2. after optional blank spans, a non-bold span further right than (1)
///      whose text contains "(",
///   3. after optional blank spans, a non-bold span further right than (1)
///      whose text contains ")".
/// Matching spans are tagged through `mark "speaker"`. The enumerator must
/// already be positioned on an element; it is advanced as a side effect.
///
/// NOTE(review): the original indentation of this function was lost; the
/// nesting below was reconstructed to follow the pattern description.
/// In particular, when span 2 contains both "(" and ")" the pair is marked
/// and the search for span 3 still continues - confirm that is intended.
let speaker (n : XE) =
    let c1 = n.Current
    let f1 = c1 @ "font"
    if f1 -?- "Bold" then
        let l1 = c1 |> bbox |> left
        // Stop quietly when the input ends: Current is not valid once
        // MoveNext has returned false.
        if skipBlank n then
            let c2 = n.Current
            let f2 = c2 @ "font"
            if not (f2 -?- "Bold") && l1 < (c2 |> bbox |> left) then
                let s2 = c2 @ "s"
                if s2 -?- "(" then
                    if s2 -?- ")" then
                        [c1; c2] |> mark "speaker"
                    if skipBlank n then
                        let c3 = n.Current
                        let f3 = c3 @ "font"
                        if not (f3 -?- "Bold")
                           && l1 < (c3 |> bbox |> left)
                           && (c3 @ "s") -?- ")" then
                            [c1; c2; c3] |> mark "speaker"

/// Runs `speaker` with every span from index 29 onwards as the starting
/// point.
/// NOTE(review): the start index 29 is hard-coded; it looks like a leftover
/// from debugging against a specific document - confirm before reuse.
let test (x : XElement) =
    let spans = x.Descendants(!> "span") |> Seq.toArray
    for i = 29 to spans.Length - 1 do
        // `use` disposes the enumerator at the end of each iteration.
        use n = (spans |> Seq.skip i).GetEnumerator()
        if n.MoveNext() then speaker n
\[/code\]
Input
\[code\]
<block bbox="63.2999 550.846 246.865 561.875">
 <line bbox="63.2999 550.846 246.865 561.875">
  <span bbox="63.2999 550.846 189.001 561.875" font="TimesNewRoman,Bold" size="9.96" s="Dr. Frank-Walter Steinmeier " />
  <span bbox="189 550.846 246.865 561.875" font="TimesNewRoman" size="9.96" s="(SPD) . . . . . ." />
 </line>
</block>
<block bbox="63.2999 567.766 246.875 578.796">
 <line bbox="63.2999 567.766 246.875 578.796">
  <span bbox="63.2999 567.766 136.004 578.796" font="TimesNewRoman,Bold" size="9.96" s="Rainer Br