(**************************************************************************) (* *) (* Copyright (C) Jean-Christophe Filliatre *) (* *) (* This software is free software; you can redistribute it and/or *) (* modify it under the terms of the GNU Library General Public *) (* License version 2, with the special exception on linking *) (* described in file LICENSE. *) (* *) (* This software is distributed in the hope that it will be useful, *) (* but WITHOUT ANY WARRANTY; without even the implied warranty of *) (* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *) (* *) (**************************************************************************) (*s {\bf Anagrams.} The following program finds all the anagrams of a given set of characters among a dictionary. Such a dictionary can be built by the following program given a list of files containing words (one per line). *) (*s The dictionary is implemented as a \emph{trie}. It is a multi-branching tree, where branches are labelled with characters. Each node contains a boolean which says if the word corresponding to the path from the root belongs to the dictionary. The branches are implemented as maps from characters to dictionaries. *) module Cmap = Map.Make(struct type t = char let compare = compare end) type tree = Node of bool * tree Cmap.t let empty = Node (false, Cmap.empty) (*s Insertion of a new word in the trie is just a matter of descending in the tree, taking the branches corresponding to the successive characters. Each time a branch does not exist, we continue the insertion in a new empty tree. When the insertion is done in the subtree, we update the branching to the new subtree. *) let add t w = let n = String.length w in let rec addrec i (Node (b,m) as t) = if i = n then if b then t else Node (true,m) else let c = w.[i] in let br = try Cmap.find c m with Not_found -> empty in let t' = addrec (succ i) br in Node (b, Cmap.add c t' m) in addrec 0 t (*s Even if it is not necessary, here is the function [mem] to tests whether a word belongs or not to the dictionary. *) let mem t w = let n = String.length w in let rec look i (Node (b,m)) = if i = n then b else try look (succ i) (Cmap.find w.[i] m) with Not_found -> false in look 0 t (*s The algorithm for anagrans is the following. We start from the root of the tree with all the initial characters. Then, for each \emph{distinct} character [c], we descend in the corresponding branch, and apply recursively the algorithm with \emph{one occurrence} of [c] being removed. When the collection of characters is empty, we simply test the boolean at the current node. Whenever a branch is missing, we stop the exploration. It appears that we need to deal with \emph{multi-sets} of characters. Indeed, we have to keep the collection of characters which have not yet been examined, and it may contain repetitions. *) (*s Multi-sets of characters are implemented as maps from characters to positive integers. The operations of insertion and deletion are easily implemented. We also provide a function [ms_of_string] to build the multi-set corresponding to a given string. *) let ms_add c m = try let n = Cmap.find c m in Cmap.add c (succ n) m with Not_found -> Cmap.add c 1 m let ms_remove c m = let n = Cmap.find c m in if n = 1 then Cmap.remove c m else Cmap.add c (pred n) m let ms_of_string w = let n = String.length w in let rec add i = if i = n then Cmap.empty else ms_add w.[i] (add (succ i)) in add 0 (*s Then implementing the above algorithm is rather easy. During the exploration, we keep three values: first, the current path [pref] from the root of the initial tree, in reverse order; secondly, the current node being examined, [(b,m)]; and finally, the current multi-set of characters [s]. *) let subset = ref true let rec print_prefix = function | [] -> () | c::l -> print_prefix l; print_char c let anagram d w = let rec traverse pref (Node (b,m)) s = if b && (s = Cmap.empty || !subset) then begin print_prefix pref; print_newline () end; Cmap.iter (fun c _ -> try traverse (c::pref) (Cmap.find c m) (ms_remove c s) with Not_found -> ()) s in traverse [] d (ms_of_string w) (*s Building the dictionary. The function [add_one_file] read all the words contained in file [file] and inserts them in the tree [t]. Then function [build_dict] constructs the whole dictionary by successively inserting the words for the given list of files. *) let add_one_file t file = Printf.printf "Reading %s\n" file; flush stdout; let ch = open_in file in let rec read t = try let w = input_line ch in read (add t w) with End_of_file -> t in let t' = read t in close_in ch; t' let build_dict = List.fold_left add_one_file empty (*s The following function [print_all] prints all the words of a given dictionary. Only used for checks (option \texttt{-a}). *) let print_all d = let rec traverse pref (Node (b, m)) = if b then begin print_prefix pref; print_newline () end; Cmap.iter (fun c t -> traverse (c::pref) t) m in traverse [] d (*s The main program. It mainly provides two ways of invoking the program: first, the option \texttt{-b} will build the dictionary from the given files and put it in the file ["dict.out"]; secondly, the program invoked with a word on the command line will print all the anagrams for this word. Option \texttt{-e} specifies exact anagrams (i.e., with all characters used). *) let output_dict d = let ch = open_out "dict.out" in output_value ch d; close_out ch let input_dict () = let ch = open_in "dict.out" in let d = input_value ch in close_in ch; d let usage () = prerr_endline "usage:"; prerr_endline " anagram -b files"; prerr_endline " anagram [-e] word" let main () = match List.tl (Array.to_list Sys.argv) with | [] | "-h" :: _ -> usage () | "-a" :: _ -> let d = input_dict () in print_all d | "-b" :: files -> let d = build_dict files in output_dict d | "-e" :: w :: _ -> subset := false; let d = input_dict () in anagram d w | w :: _ -> let d = input_dict () in anagram d w let _ = Printexc.catch main ()