(**************************************************************************) (* *) (* Copyright (C) Jean-Christophe Filliatre *) (* *) (* This software is free software; you can redistribute it and/or *) (* modify it under the terms of the GNU Library General Public *) (* License version 2.1, with the special exception on linking *) (* described in file LICENSE. *) (* *) (* This software is distributed in the hope that it will be useful, *) (* but WITHOUT ANY WARRANTY; without even the implied warranty of *) (* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *) (* *) (**************************************************************************) (* Suffix trees *) (* Following Ukkonen's algorithm as described in Gusfield's book "Algorithms on Strings, Trees and Sequences" *) module type Alphabet = sig type t val dummy : t val equal : t -> t -> bool val compare : t -> t -> int val print : Format.formatter -> t -> unit type s val length : s -> int val get : s -> int -> t end module type Branching = sig type key type 'a t val create : unit -> 'a t val add : 'a t -> key -> 'a -> unit val find : 'a t -> key -> 'a val iter : (key -> 'a -> unit) -> 'a t -> unit end let debug = false module Make(A : Alphabet)(B : Branching with type key = A.t) = struct type node = { mutable label_start : int; mutable label_end : int; mutable father : node; mutable node_type : node_type; mutable suffix_link : node; path_position : int } and node_type = Leaf of int Branch of node B.t type t = { tree_string : A.s; tree_root : node; } open Format let print fmt t = let m = A.length t.tree_string in let rec print_node n depth = if depth > 0 then begin for i = 1 to depth - 1 do printf "|"; done; fprintf fmt "+"; for i = n.label_start to n.label_end do A.print fmt (if i == m+1 then A.dummy else A.get t.tree_string (i-1)) done; if debug then fprintf fmt " \t\t\t(%d,%d)" n.label_start n.label_end; fprintf fmt "@\n" end; match n.node_type with Leaf _ -> () Branch b -> B.iter (fun _ n -> print_node n (depth+1)) b in print_node t.tree_root 0 (* a dummy node that will never appear in returned suffix trees *) let rec dummy_node = { label_start = 0; label_end = 0; father = dummy_node; node_type = Leaf (-1); suffix_link = dummy_node; path_position = 0 } let create_node ?(suffix_link=dummy_node) ~father ls le nt pp = { label_start = ls; label_end = le; father = father; node_type = nt; suffix_link = suffix_link; path_position = pp } type rule = Rule1 Rule2 Rule3 type position = { mutable pos_node : node; mutable pos_in_edge : int } type extension_rule_2 = New_son Split (* Ukkonen's algorithm *) let create s = let m = A.length s in (* we do not build [s$]; instead we override [get] to cover [1..m+1] *) let get i = assert (1 <= i && i <= m+1); if i <= m then A.get s (i-1) else A.dummy in (* the root tree and the first node *) let root_sons = B.create () in let rec root = { label_start = 0; label_end = 0; father = root; node_type = Branch root_sons; suffix_link = root; path_position = 0 } in let tree = { tree_string = s; tree_root = root } in let n = create_node ~suffix_link:root ~father:root 1 (m+1) (Leaf 1) 1 in B.add root_sons (get 1) n; (* label end: during construction label end of a leaf is [!e] *) let e = ref 0 in let label_end n = match n.node_type with Leaf _ -> !e Branch _ -> n.label_end in (* DEBUG *) let print fmt t = let m = A.length t.tree_string in let rec print_node n depth = let e = label_end n in if depth > 0 then begin for i = 1 to depth - 1 do printf "|"; done; fprintf fmt "+"; for i = n.label_start to e do A.print fmt (if i == m+1 then A.dummy else A.get t.tree_string (i-1)) done; if debug then fprintf fmt " \t\t\t(%d,%d)" n.label_start e; fprintf fmt "@\n" end; match n.node_type with Leaf _ -> () Branch b -> B.iter (fun _ n -> print_node n (depth+1)) b in print_node t.tree_root 0 in (* END DEBUG *) let suffix_less = ref dummy_node in let set_suffix_link n = if !suffix_less != dummy_node then begin !suffix_less.suffix_link <- n; suffix_less := dummy_node end in let pos = { pos_node = root; pos_in_edge = 0 } in let label_length n = label_end n - n.label_start + 1 in let last_char_in_edge () = if debug then printf "last_char_in_edge = %b@." (pos.pos_in_edge = label_length pos.pos_node - 1); pos.pos_in_edge = label_length pos.pos_node - 1 in (* select the right branch *) let find_son n c = match n.node_type with Leaf _ -> dummy_node Branch b -> (try B.find b c with Not_found -> dummy_node) in (* tracing a string down the tree (updating [pos]) *) let trace_single_edge (gs,ge) ~skip = let cont_node = find_son pos.pos_node (get gs) in if cont_node == dummy_node then begin pos.pos_in_edge <- label_length pos.pos_node - 1; 0, true end else begin pos.pos_node <- cont_node; let length = label_length pos.pos_node in let str_len = ge - gs + 1 in if skip then begin if length <= str_len then begin pos.pos_in_edge <- length - 1; length, not (length < str_len) end else begin pos.pos_in_edge <- str_len - 1; str_len, true end end else begin let length = if str_len < length then str_len else length in pos.pos_in_edge <- 1; let rec loop chars_found = if pos.pos_in_edge < length then begin if not (A.equal (get (pos.pos_node.label_start + pos.pos_in_edge)) (get (gs + pos.pos_in_edge))) then begin pos.pos_in_edge <- pos.pos_in_edge - 1; chars_found, true end else begin pos.pos_in_edge <- pos.pos_in_edge + 1; loop (succ chars_found) end end else begin pos.pos_in_edge <- pos.pos_in_edge - 1; chars_found, not (chars_found < str_len) end in loop 1 end end in let trace_string gamma ~skip = let rec trace chars_found ((gs,ge) as g) = pos.pos_in_edge <- 0; let edge_chars_found, search_done = trace_single_edge g ~skip in let chars_found = chars_found + edge_chars_found in if not search_done then trace chars_found (gs + edge_chars_found, ge) else chars_found in trace 0 gamma in (* following the suffix link *) let follow_suffix_link () = if pos.pos_node != root then begin if pos.pos_node.suffix_link == dummy_node || not (last_char_in_edge ()) then begin if pos.pos_node.father == root then pos.pos_node <- root else begin let s = pos.pos_node.label_start in let gamma = (s, s + pos.pos_in_edge) in pos.pos_node <- pos.pos_node.father.suffix_link; ignore (trace_string gamma ~skip:true) end end else begin pos.pos_node <- pos.pos_node.suffix_link; pos.pos_in_edge <- label_length pos.pos_node - 1 end end in (* node creation (Rule 2) *) let apply_extension_rule_2 ls le path_pos edge_pos = function New_son -> if debug then printf "rule 2: new leaf (%d,%d)@." ls le; let leaf = create_node ls le ~father:pos.pos_node (Leaf path_pos) path_pos in begin match pos.pos_node.node_type with Branch b -> B.add b (get ls) leaf; leaf Leaf _ -> assert false end Split -> if debug then printf "rule 2: split (%d,%d)@." ls le; let node = pos.pos_node in let b = B.create () in let intl = create_node node.label_start (node.label_start + edge_pos) ~father:node.father (Branch b) node.path_position in node.label_start <- node.label_start + edge_pos + 1; let leaf = create_node ~father:intl ls le (Leaf path_pos) path_pos in B.add b (get node.label_start) node; B.add b (get ls) leaf; node.father <- intl; begin match intl.father.node_type with Branch b -> B.add b (get intl.label_start) intl; intl Leaf _ -> assert false end in (* Single Extension Algorithm (SEA) *) let sea j0 i_1 after_rule_3 = if debug then begin printf "%a@." print tree; printf "extension: %d phase+1: %d" j0 i_1; if after_rule_3 then printf " starting at (%d,%d | %d)\n" pos.pos_node.label_start (label_end pos.pos_node) pos.pos_in_edge else printf " followed from (%d,%d | %d)\n" pos.pos_node.label_start (label_end pos.pos_node) pos.pos_in_edge end; let j = ref j0 in if not after_rule_3 then follow_suffix_link (); let chars_found = if pos.pos_node == root then begin trace_string (!j,i_1) ~skip:false end else begin j := i_1; if last_char_in_edge () then begin let tmp = find_son pos.pos_node (get i_1) in if tmp != dummy_node then begin pos.pos_node <- tmp; pos.pos_in_edge <- 0; 1 end else 0 end else if A.equal (get (pos.pos_node.label_start + pos.pos_in_edge + 1)) (get i_1) then begin pos.pos_in_edge <- pos.pos_in_edge + 1; 1 end else 0 end in if debug then printf "chars_found = %d@." chars_found; if chars_found = i_1 - !j + 1 then begin (* Rule 3 applies *) if debug then printf "rule 3 (%d,%d)@." !j i_1; set_suffix_link pos.pos_node.father; Rule3 end else if last_char_in_edge () || pos.pos_node == root then match pos.pos_node.node_type with Branch b -> ignore (apply_extension_rule_2 (!j + chars_found) i_1 j0 0 New_son); set_suffix_link pos.pos_node; Rule2 Leaf _ -> Rule1 else begin let tmp = apply_extension_rule_2 (!j+chars_found) i_1 j0 pos.pos_in_edge Split in if !suffix_less != dummy_node then !suffix_less.suffix_link <- tmp; if label_length tmp = 1 && tmp.father == root then begin tmp.suffix_link <- root; suffix_less := dummy_node end else suffix_less := tmp; pos.pos_node <- tmp; Rule2 end in (* Ukkonen main loop *) let extension = ref 2 in let last_rule_is_3 = ref false in for i = 2 to m do (* phase [i+1] *) e := i+1; let rec spa () = if !extension <= i+1 then begin let rule = sea !extension (i+1) !last_rule_is_3 in last_rule_is_3 := (rule = Rule3); if not !last_rule_is_3 then begin incr extension; spa () end end in spa () done; let rec set_leaf_end n = match n.node_type with Leaf _ -> n.label_end <- m+1 Branch b -> B.iter (fun _ n -> set_leaf_end n) b in set_leaf_end root; tree let find t s = let m = A.length t.tree_string in let p = A.length s in let get i = assert (1 <= i && i <= m+1); if i <= m then A.get t.tree_string (i-1) else A.dummy in let find_son n c = match n.node_type with Leaf _ -> raise Not_found Branch b -> B.find b c in let rec descend n j = let rec descend_edge j k = if j < p && k <= n.label_end && A.equal (get k) (A.get s j) then descend_edge (succ j) (succ k) else if j = p then { pos_node = n; pos_in_edge = k } else if k > n.label_end then descend (find_son n (A.get s j)) j else raise Not_found in descend_edge j n.label_start in let n0 = match t.tree_root.node_type with Branch b -> B.find b (if p = 0 then A.dummy else A.get s 0) Leaf _ -> assert false in descend n0 0 let substring t s = if A.length s = 0 then 0 else let p = find t s in p.pos_node.path_position - 1 let leaves f p = let rec iter n = match n.node_type with Leaf j -> f (j - 1) Branch b -> B.iter (fun _ n -> iter n) b in iter p.pos_node end (* some usual branching implementations *) module Bmap(X : Map.OrderedType) : Branching with type key = X.t = struct type key = X.t module M = Map.Make(X) type 'a t = 'a M.t ref let create () = ref M.empty let add m k v = m := M.add k v !m let find m k = M.find k !m let iter f m = M.iter f !m end module Barray(A : sig val size : int end) : Branching with type key = int = struct type key = int type 'a t = 'a option array let create () = Array.create A.size None let add m k v = m.(k) <- Some v let find m k = match m.(k) with Some v -> v None -> raise Not_found let iter f m = Array.iteri (fun k v -> match v with None -> () Some v -> f k v) m end module Blist(X : sig type t val equal : t -> t -> bool end) : Branching with type key = X.t = struct type key = X.t type 'a t = (key * 'a) list ref let create () = ref [] let add m k v = let rec replace = function [] -> raise Not_found (a,_ as p) :: l -> if X.equal k a then (a,v) :: l else p :: replace l in m := try replace !m with Not_found -> (k,v) :: !m let find m k = let rec lookup = function [] -> raise Not_found (a,b) :: l -> if X.equal k a then b else lookup l in lookup !m let iter f m = List.iter (fun (a,b) -> f a b) !m end module Bhash(A : sig val size : int end)(X : Hashtbl.HashedType) : Branching with type key = X.t = struct type key = X.t module H = Hashtbl.Make(X) type 'a t = 'a H.t let create () = H.create A.size let add m k v = H.replace m k v let find m k = H.find m k let iter = H.iter end module CharString = struct type t = char let dummy = Char.chr 0 let equal x y = x == y let compare x y = Char.code x - Char.code y let hash c = Char.code c let print fmt c = Format.fprintf fmt "%c" c type s = string let length = String.length let get = String.unsafe_get end module IntArray = struct type t = int let dummy = -1 let equal x y = x == y let compare = compare let hash c = Char.code c let print fmt c = Format.fprintf fmt "%d" c type s = int array let length = Array.length let get = Array.unsafe_get end (* manually defunctorized code for type string *) module Ukkonen = struct (*module B = Bhash(struct let size = 17 end)(CharString)*) module B = Bmap(CharString) type node = { mutable label_start : int; mutable label_end : int; mutable father : node; mutable node_type : node_type; mutable suffix_link : node; path_position : int; } and node_type = Leaf of int Branch of node B.t type t = { tree_string : string; tree_root : node; } open Format let print fmt t = let m = String.length t.tree_string in let rec print_node n depth = if depth > 0 then begin for i = 1 to depth - 1 do printf "|"; done; fprintf fmt "+"; for i = n.label_start to n.label_end do fprintf fmt "%c" (if i == m+1 then '$' else String.unsafe_get t.tree_string (i-1)) done; if debug then fprintf fmt " \t\t\t(%d,%d)" n.label_start n.label_end; fprintf fmt "@\n" end; match n.node_type with Leaf _ -> () Branch b -> B.iter (fun _ n -> print_node n (depth+1)) b in print_node t.tree_root 0 (* a dummy node that will never appear in returned suffix trees *) let rec dummy_node = { label_start = 0; label_end = 0; father = dummy_node; node_type = Leaf (-1); suffix_link = dummy_node; path_position = 0 } let create_node ?(suffix_link=dummy_node) ~father ls le nt pp = { label_start = ls; label_end = le; father = father; node_type = nt; suffix_link = suffix_link; path_position = pp } type rule = Rule1 Rule2 Rule3 type position = { mutable pos_node : node; mutable pos_in_edge : int } type extension_rule_2 = New_son Split (* Ukkonen's algorithm *) let create s = let m = String.length s in let s0 = String.make (m+2) '\000' in String.blit s 0 s0 1 m; (* we do not build [s$]; instead we override [get] to cover [1..m+1] *) let get i = assert (1 <= i && i <= m+1); String.unsafe_get s0 i in (* the root tree and the first node *) let root_sons = B.create () in let rec root = { label_start = 0; label_end = 0; father = root; node_type = Branch root_sons; suffix_link = root; path_position = 0 } in let tree = { tree_string = s; tree_root = root } in let n = create_node ~suffix_link:root ~father:root 1 (m+1) (Leaf 1) 1 in B.add root_sons (get 1) n; (* label end: during construction label end of a leaf is [!e] *) let e = ref 0 in let label_end n = match n.node_type with Leaf _ -> !e Branch _ -> n.label_end in (* DEBUG *) let print fmt t = let m = String.length t.tree_string in let rec print_node n depth = let e = label_end n in if depth > 0 then begin for i = 1 to depth - 1 do printf "|"; done; fprintf fmt "+"; for i = n.label_start to e do fprintf fmt "%c" (if i == m+1 then '$' else String.unsafe_get t.tree_string (i-1)) done; if debug then fprintf fmt " \t\t\t(%d,%d)" n.label_start e; fprintf fmt "@\n" end; match n.node_type with Leaf _ -> () Branch b -> B.iter (fun _ n -> print_node n (depth+1)) b in print_node t.tree_root 0 in (* END DEBUG *) let suffix_less = ref dummy_node in let set_suffix_link n = if !suffix_less != dummy_node then begin !suffix_less.suffix_link <- n; suffix_less := dummy_node end in let pos = { pos_node = root; pos_in_edge = 0 } in let label_length n = label_end n - n.label_start + 1 in let last_char_in_edge () = if debug then printf "last_char_in_edge = %b@." (pos.pos_in_edge = label_length pos.pos_node - 1); pos.pos_in_edge = label_length pos.pos_node - 1 in (* select the right branch *) let find_son n c = match n.node_type with Leaf _ -> dummy_node Branch b -> (try B.find b c with Not_found -> dummy_node) in (* tracing a string down the tree (updating [pos]) *) let trace_single_edge (gs,ge) ~skip = let cont_node = find_son pos.pos_node s0.[gs] in if cont_node == dummy_node then begin pos.pos_in_edge <- label_length pos.pos_node - 1; 0, true end else begin pos.pos_node <- cont_node; let length = label_length pos.pos_node in let str_len = ge - gs + 1 in if skip then begin if length <= str_len then begin pos.pos_in_edge <- length - 1; length, not (length < str_len) end else begin pos.pos_in_edge <- str_len - 1; str_len, true end end else begin let length = if str_len < length then str_len else length in pos.pos_in_edge <- 1; let rec loop chars_found = if pos.pos_in_edge < length then begin if not (s0.[pos.pos_node.label_start + pos.pos_in_edge] == s0.[gs + pos.pos_in_edge]) then begin pos.pos_in_edge <- pos.pos_in_edge - 1; chars_found, true end else begin pos.pos_in_edge <- pos.pos_in_edge + 1; loop (succ chars_found) end end else begin pos.pos_in_edge <- pos.pos_in_edge - 1; chars_found, not (chars_found < str_len) end in loop 1 end end in let trace_string gamma ~skip = let rec trace chars_found ((gs,ge) as g) = pos.pos_in_edge <- 0; let edge_chars_found, search_done = trace_single_edge g ~skip in let chars_found = chars_found + edge_chars_found in if not search_done then trace chars_found (gs + edge_chars_found, ge) else chars_found in trace 0 gamma in (* following the suffix link *) let follow_suffix_link () = if pos.pos_node != root then begin if pos.pos_node.suffix_link == dummy_node || not (last_char_in_edge ()) then begin if pos.pos_node.father == root then pos.pos_node <- root else begin let s = pos.pos_node.label_start in let gamma = (s, s + pos.pos_in_edge) in pos.pos_node <- pos.pos_node.father.suffix_link; ignore (trace_string gamma ~skip:true) end end else begin pos.pos_node <- pos.pos_node.suffix_link; pos.pos_in_edge <- label_length pos.pos_node - 1 end end in (* node creation (Rule 2) *) let apply_extension_rule_2 ls le path_pos edge_pos = function New_son -> if debug then printf "rule 2: new leaf (%d,%d)@." ls le; let leaf = create_node ls le ~father:pos.pos_node (Leaf path_pos) path_pos in begin match pos.pos_node.node_type with Branch b -> B.add b s0.[ls] leaf; leaf Leaf _ -> assert false end Split -> if debug then printf "rule 2: split (%d,%d)@." ls le; let node = pos.pos_node in let b = B.create () in let intl = create_node node.label_start (node.label_start + edge_pos) ~father:node.father (Branch b) node.path_position in node.label_start <- node.label_start + edge_pos + 1; let leaf = create_node ~father:intl ls le (Leaf path_pos) path_pos in B.add b s0.[node.label_start] node; B.add b s0.[ls] leaf; node.father <- intl; begin match intl.father.node_type with Branch b -> B.add b s0.[intl.label_start] intl; intl Leaf _ -> assert false end in (* Single Extension Algorithm (SEA) *) let sea j0 i_1 after_rule_3 = if debug then begin printf "%a@." print tree; printf "extension: %d phase+1: %d" j0 i_1; if after_rule_3 then printf " starting at (%d,%d | %d)\n" pos.pos_node.label_start (label_end pos.pos_node) pos.pos_in_edge else printf " followed from (%d,%d | %d)\n" pos.pos_node.label_start (label_end pos.pos_node) pos.pos_in_edge end; let j = ref j0 in if not after_rule_3 then follow_suffix_link (); let chars_found = if pos.pos_node == root then begin trace_string (!j,i_1) ~skip:false end else begin j := i_1; if last_char_in_edge () then begin let tmp = find_son pos.pos_node s0.[i_1] in if tmp != dummy_node then begin pos.pos_node <- tmp; pos.pos_in_edge <- 0; 1 end else 0 end else if s0.[pos.pos_node.label_start + pos.pos_in_edge + 1] == s0.[i_1] then begin pos.pos_in_edge <- pos.pos_in_edge + 1; 1 end else 0 end in if debug then printf "chars_found = %d@." chars_found; if chars_found = i_1 - !j + 1 then begin (* Rule 3 applies *) if debug then printf "rule 3 (%d,%d)@." !j i_1; set_suffix_link pos.pos_node.father; Rule3 end else if last_char_in_edge () || pos.pos_node == root then match pos.pos_node.node_type with Branch b -> ignore (apply_extension_rule_2 (!j + chars_found) i_1 j0 0 New_son); set_suffix_link pos.pos_node; Rule2 Leaf _ -> Rule1 else begin let tmp = apply_extension_rule_2 (!j+chars_found) i_1 j0 pos.pos_in_edge Split in if !suffix_less != dummy_node then !suffix_less.suffix_link <- tmp; if label_length tmp = 1 && tmp.father == root then begin tmp.suffix_link <- root; suffix_less := dummy_node end else suffix_less := tmp; pos.pos_node <- tmp; Rule2 end in (* Ukkonen main loop *) let extension = ref 2 in let last_rule_is_3 = ref false in for i = 2 to m do (* phase [i+1] *) e := i+1; let rec spa () = if !extension <= i+1 then begin let rule = sea !extension (i+1) !last_rule_is_3 in last_rule_is_3 := (rule = Rule3); if not !last_rule_is_3 then begin incr extension; spa () end end in spa () done; let rec set_leaf_end n = match n.node_type with Leaf _ -> n.label_end <- m+1 Branch b -> B.iter (fun _ n -> set_leaf_end n) b in set_leaf_end root; tree let dummy_char = Char.chr 0 let find t s = let m = String.length t.tree_string in let p = String.length s in let get i = assert (1 <= i && i <= m+1); if i <= m then String.unsafe_get t.tree_string (i-1) else dummy_char in let find_son n c = match n.node_type with Leaf _ -> raise Not_found Branch b -> B.find b c in let rec descend n j = let rec descend_edge j k = if j < p && k <= n.label_end && (get k) == (String.unsafe_get s j) then descend_edge (succ j) (succ k) else if j = p then { pos_node = n; pos_in_edge = k } else if k > n.label_end then descend (find_son n (String.unsafe_get s j)) j else raise Not_found in descend_edge j n.label_start in let n0 = match t.tree_root.node_type with Branch b -> B.find b (if p = 0 then dummy_char else String.unsafe_get s 0) Leaf _ -> assert false in descend n0 0 let substring t s = if String.length s = 0 then 0 else let p = find t s in p.pos_node.path_position - 1 let leaves f p = let rec iter n = match n.node_type with Leaf j -> f (j - 1) Branch b -> B.iter (fun _ n -> iter n) b in iter p.pos_node (* ne fonctionne que si [B = Bmap] car il faut que [iter] soit lexic. *) let suffix_array t = let m = String.length t.tree_string in let rec iter n = match n.node_type with Leaf j -> printf "%s\n" (String.sub t.tree_string (j-1) (m-j+1)) Branch b -> B.iter (fun _ n -> iter n) b in iter t.tree_root end
This document was generated using caml2html