【问题标题】:string difference pretty-printer for OUnit.assert_equalOUnit.assert_equal 的字符串差异漂亮打印机
【发布时间】:2014-04-08 20:28:13
【问题描述】:

OUnit.assert_equal ~pp_diff 允许漂亮地打印预期/实际值差异,OUnitDiff 似乎为集合提供了差异。

是否有用于字符串值的库存pp_diff?理想情况下,它会尽最大努力将差异扩展到最接近的 UTF-8 序列边界。

即使是常见的前缀/后缀消除也比没有好。

【问题讨论】:

  • 你可能会从patdiff中获取一些元素;或者创建一个编辑距离矩阵并获取回溯以获取已编辑的字符串(如果它们很短)。

标签: ocaml ounit


【解决方案1】:

一个有趣的早晨挑战。

type move = Same | Add | Del

let edit_distance_matrix a b =
  (* The usual dynamic edit distance algorithm, except we keep
     a complete matrix of moves to be able to step back and see which
     operations can turn [sa] into [sb].

     This is not very efficient: we keep the complete matrices of
     distances (costs) and moves. One doesn't need to know the move
     for all cases of the matrix, only those that are on the "best"
     path from begin to end; it would be better to recompute the moves
     along the path after the facts. There probably also exists
     a classic clever trick to apply the usual optimization of keeping
     only two rows of the matrix at any time, and still compute the
     best path along the way. 
  *)
  let la, lb = String.length a, String.length b in
  let m = Array.make_matrix (la + 1) (lb + 1) (-1) in
  let moves = Array.make_matrix (la + 1) (lb + 1) Same in
  m.(0).(0) <- 0;
  for i = 1 to la do
    m.(i).(0) <- i;
  done;
  for j = 1 to lb do
    m.(0).(j) <- j;
  done;
  for i = 1 to la do
    for j = 1 to lb do
      let best, move =
        if a.[i-1] = b.[j-1] then m.(i-1).(j-1), Same
        else
          if m.(i-1).(j) <= m.(i).(j-1)
          then m.(i-1).(j) + 1, Del
          else m.(i).(j-1) + 1, Add
      in
      m.(i).(j) <- best;
      moves.(i).(j) <- move;
    done;
  done;
  m, moves

let get m (i, j) = m.(i).(j)

let valid m pos =
  fst pos >= 0 && snd pos >= 0

let previous (i, j) = function
  | Same -> (i - 1, j - 1)
  | Add -> (i, j - 1)
  | Del -> (i - 1, j)

let cons _pos action = function
  | (action', n) :: rest when action = action' ->
    (action', n+1) :: rest
  | list -> (action, 1) :: list

(** walk back along the "best path", taking notes of changes to make
    as we go *)
let chunks moves =
  let la = Array.length moves - 1 in
  let lb = Array.length moves.(0) - 1 in
  let start = (la, lb) in
  let rec loop acc pos =
    let move = get moves pos in
    let next_pos = previous pos move in
    (* if the next position is not valid,
       the current move is a dummy move,
       and it must not be returned as part of [acc] *)
    if not (valid moves next_pos) then acc
    else loop (cons pos move acc) next_pos
  in loop [] start

(** print the list of changes in term of the original string

    We skip large parts of the string that are common, keeping only
    [context] characters on the sides to provide some context.
*)
let diff context sa sb =
  let cost, moves = edit_distance_matrix sa sb in
  let chks = chunks moves in
  let buf = Buffer.create cost.(String.length sa).(String.length sb) in
  let rec loop i j = function
    | [] -> ()
    | (Same, n) :: rest ->
      if n <= 2 * context then
        Buffer.add_substring buf sa i n
      else begin
        Buffer.add_substring buf sa i context;
        Buffer.add_string buf "...\n...";
        Buffer.add_substring buf sa (i + n - context) context;
      end;
      loop (i + n) (j + n) rest
    | (Add, n) :: rest ->
      begin
        Buffer.add_string buf "[+";
        Buffer.add_substring buf sb j n;
        Buffer.add_char buf ']';
      end;
      loop i (j + n) rest 
    | (Del, n) :: rest ->
      begin
        Buffer.add_string buf "[-";
        Buffer.add_substring buf sa i n;
        Buffer.add_char buf ']';
      end;
      loop (i + n) j rest 
  in
  begin
    try loop 0 0 chks with _ -> ()
  end;
  Buffer.contents buf

测试:

# print_endline @@ diff 4
    "le gros chat mange beaucoup de croquettes au saumon"
    "le chat maigre mange peu de croquettes au saumon"
  ;;
le[- gros] chat[+ maigre] mange [+p][-b]e[-auco]u[-p] de ...
...umon

【讨论】:

  • 不,没有办法保留两行并获得回溯。使用我们在上一份工作中开发的算法可以提高效率——数据结构是基于对角线的而不是基于行的——但是由于必要的 if 语句的数量,它在像这样的小矩阵中没有帮助——它用于分子比对。
  • 我们可以做的是存储两行,不是移动,而是路径(迄今为止最好的路径):在最坏的情况下这仍然是O(|a|*|b|),但我们有一个压缩在实践中效果相对较好的路径技术,因此实际输入的内存消耗应该小得多。
  • 当然,我明白了。大概这样就不会存储与单元格关联的每个前缀,而是一个合理的子集?我希望这在合理的输入上也能很好地工作。关于路径的压缩技术,您指的是 trie 吗?
  • 仅在我的代码后面使用的分块表示:压缩相同性质的连续移动。顺便说一句,感谢有关两行+路径案例的信息,我承认我从来都不是动态编程方面的专家。
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2013-07-07
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2015-04-05
相关资源
最近更新 更多