(ns codescene.features.code-coverage.data-validator.file-set-comparison
  "Functions for comparing lists of files in coverage data
  definitions. The goal here is to quickly decide if two data
  definitions refer to mostly the same files or not.

  The motivation for this is to use less memory: we want to be able to
  load a compact representation of the files in a data
  definition. Here, we store the list of files as a tree, so that
  directory names are not repeated across hundreds or thousands of
  files."
  (:require
   [clojure.set :as set]
   [clojure.string :as str]
   [clojure.edn :as edn]))

(defn paths-to-tree
  "Builds a nested map (a directory tree) from a collection of
  `/`-separated path strings.

  Every directory is a map whose string keys are sub-directory names
  and whose `:files` key holds a vector of the file names located
  directly in that directory, in input order.

  Empty path segments (leading or doubled slashes) are ignored, so
  \"/a/b.clj\" and \"a/b.clj\" end up in the same place. Paths that
  contain no segments at all (\"\" or \"/\") are skipped."
  [paths]
  (reduce
   (fn [tree path]
     ;; `str/split` never yields nil, only empty strings, so those are
     ;; what has to be removed here.
     (let [segments (remove empty? (str/split path #"/"))]
       (if (seq segments)
         ;; `update-in` creates missing intermediate directories and
         ;; `fnil` makes sure `:files` always starts out as a vector,
         ;; whether or not the directory already existed.
         (update-in tree
                    (concat (butlast segments) [:files])
                    (fnil conj [])
                    (last segments))
         tree)))
   {}
   paths))

(defn lazy-leaf-count
  "Given a seq of directory nodes (maps with an optional `:files`
  entry and string keys for sub-directories), returns a lazy seq with
  the number of files found directly in each node and in every nested
  sub-directory, in depth-first pre-order. Summing the result gives
  the total number of files under `nodes`.

  Built on `tree-seq` so that the walk is actually lazy and does not
  consume one stack frame per sibling node."
  [nodes]
  (let [subdirs (fn [node]
                  ;; only string keys denote sub-directories; `:files`
                  ;; is the leaf payload of the node itself
                  (keep (fn [[k v]] (when (string? k) v)) node))]
    (->> nodes
         (mapcat #(tree-seq map? subdirs %))
         (map #(count (:files %))))))

(defn mismatched-directory-keys
  "Returns a two-element vector: the keys of `t1-directory-key-set`
  and the keys of `t2-directory-key-set` that are not members of
  `matching-directory-keys`, each as a set."
  [matching-directory-keys t1-directory-key-set t2-directory-key-set]
  (let [unmatched? (complement matching-directory-keys)]
    [(set/select unmatched? t1-directory-key-set)
     (set/select unmatched? t2-directory-key-set)]))


(defn compare-trees
  "Given a seq of `[t1 t2]` pairs of directory trees, returns a lazy
  seq of three-item comparison tuples, one for every pair of
  directories that exists in both trees (starting with the given
  pairs themselves and descending into sub-directories with matching
  names).

  Each tuple is `[shared only-in-t1 only-in-t2]`:

  - `shared`     files directly in this directory that are present in
                 both trees;
  - `only-in-t1` files directly in this directory that are only in
                 `t1`, plus all files below sub-directories that only
                 exist in `t1`;
  - `only-in-t2` the same, for `t2`.

  The tuples are meant to be summed element-wise. Examples of totals:

  - [50 50 50] -> half of the files are the same on both sides, the
                  other half differ (e.g. were renamed).
  - [50 0  25] -> the first tree is a subset of the second: all of its
                  files are in the second tree, which has some more."
  [comparison-pairs]
  (lazy-seq
   (when-let [pairs (seq comparison-pairs)]
     (let [[left right :as pair] (first pairs)
           dir-names (fn [tree] (disj (set (keys tree)) :files))
           ;; total number of files below the given directories of `tree`
           files-below (fn [tree dir-keys]
                         (reduce + (lazy-leaf-count (map #(get tree %) dir-keys))))
           [left-files right-files] (map #(set (:files %)) pair)
           left-dirs (dir-names left)
           right-dirs (dir-names right)
           shared-dirs (set/intersection left-dirs right-dirs)
           shared-file-count (count (set/intersection left-files right-files))
           [left-only-dirs right-only-dirs] (mismatched-directory-keys shared-dirs left-dirs right-dirs)
           tuple [shared-file-count
                  (+ (- (count left-files) shared-file-count)
                     (files-below left left-only-dirs))
                  (+ (- (count right-files) shared-file-count)
                     (files-below right right-only-dirs))]
           ;; directories present on both sides are compared recursively
           sub-pairs (map (fn [dir] [(get left dir) (get right dir)]) shared-dirs)]
       (cons tuple
             (concat (compare-trees sub-pairs)
                     (compare-trees (rest pairs))))))))


(defn- tuple-add
  "Element-wise sum of comparison tuples (vectors of numbers).

  With no arguments returns the additive identity `[0 0 0]`, which
  matches the three-item tuples produced by `compare-trees`, so the
  result is always a tuple. With one argument returns it unchanged;
  with two or more, returns a vector holding the element-wise sums."
  ([] [0 0 0])
  ([one] one)
  ([one two]
   (mapv + one two))
  ([one two & more]
   (reduce tuple-add (tuple-add one two) more)))



(defn path-diff-score
  "Compares the directory trees `t1` and `t2` and returns the
  element-wise total of all comparison tuples produced by
  `compare-trees` for that pair."
  [t1 t2]
  (->> [[t1 t2]]
       compare-trees
       (reduce tuple-add)))


(comment
  ;; REPL scratchpad -- nothing in this form is evaluated when the
  ;; namespace is loaded. The paths below point at files on a
  ;; developer machine and have to be adjusted before use.
;  (def f "/Users/joseph/Empear/src/codescene/onprem/code-coverage-data/code-coverage-33.edn")
;  (def f2 "/Users/joseph/Empear/src/codescene/onprem/code-coverage-data/code-coverage-34.edn")

  ;; Reads an EDN file, takes the :path of every entry and builds the
  ;; directory tree for those paths.
  (defn mk-tr [f]
    (->> f  slurp edn/read-string (map :path) paths-to-tree ))
  
  (def t  (mk-tr "/Users/joseph/Downloads/code-coverage-55940.edn"))
  (def t2 (mk-tr "/Users/joseph/Downloads/code-coverage-55907.edn"))
  (def t3 (mk-tr "/Users/joseph/Downloads/code-coverage-54044.edn"))

  ;; Comparing a tree with itself: number of tuples produced, and the
  ;; time taken to compute the total score.
  (count (compare-trees [[t3 t3]]))
  (time (path-diff-score t3 t3))

  ;
  )
