(ns codescene.features.code-coverage.data-validator
  (:require
   [clojure.string :as str]
   [codescene.features.code-coverage.data-validator.file-set-comparison :as file-set-comparison]
   [codescene.features.code-coverage.validation-messaging :as validation-messaging]
   [evolutionary-metrics.mining.vcs :as vcs]
   [taoensso.timbre :as log]))


(defmulti vcs-files-changed
  "Lists the files changed between `commit-sha` and `analyzed-commit`.

  Dispatches on the kind of `git-client`: when it is a function the
  `:function` method is used and the function itself is asked for the
  answer; anything else is handled by the `:command` method."
  (fn [git-client _repo-path _commit-sha _analyzed-commit]
    (if-not (fn? git-client)
      :command
      :function)))

;; Delegates to `vcs/files-changed` and drops empty entries from its result.
(defmethod vcs-files-changed :command
  [git-client repo-path commit-sha analyzed-commit]
  (remove empty?
          (vcs/files-changed git-client repo-path commit-sha analyzed-commit)))

;; Calls the injected function with a request map; `:vcs-equivalent`
;; tells it which vcs operation is being asked for.
(defmethod vcs-files-changed :function
  [injected-fn repo-path commit-sha analyzed-commit]
  (let [request {:repo-path repo-path
                 :commit-sha commit-sha
                 :analyzed-commit analyzed-commit
                 :vcs-equivalent :files-changed}]
    (injected-fn request)))

(defmulti vcs-time-of-commit
  "Looks up the time of `analyzed-commit`.

  Dispatches on the kind of `git-client`: `:function` when it is a
  function, `:command` otherwise."
  (fn [git-client _repo-path _analyzed-commit]
    (if-not (fn? git-client)
      :command
      :function)))

;; Delegates directly to `vcs/time-of-commit`.
(defmethod vcs-time-of-commit :command
  [git-client repo-path analyzed-commit]
  (vcs/time-of-commit git-client repo-path analyzed-commit))

;; Calls the injected function with a request map; `:vcs-equivalent`
;; tells it which vcs operation is being asked for.
(defmethod vcs-time-of-commit :function
  [injected-fn repo-path analyzed-commit]
  (let [request {:repo-path repo-path
                 :analyzed-commit analyzed-commit
                 :vcs-equivalent :time-of-commit}]
    (injected-fn request)))

(defn- subpath-compare
  "Cheap pre-check on the subpaths of two data definitions, used to skip
  the more expensive path comparison when possible. Subpaths are
  expected to fall out of use eventually, but as of 2025 the check is
  still worthwhile.

  Returns:
  - `:cannot-determine` when either subpath is empty (or nil),
  - `:supersede` when both are present and equal,
  - `:not-supersede` when both are present and different."
  [existing-subpath new-subpath]
  (if (or (empty? existing-subpath) (empty? new-subpath))
    :cannot-determine
    (if (= existing-subpath new-subpath)
      :supersede
      :not-supersede)))


(defn- path-tree-compare
  "Compares the `:path-tree` of an existing and a new data definition.
  Returns `true` when the new definition should replace the existing
  one, `false` when both should be kept."
  [{existing-paths :path-tree} {new-paths :path-tree}]
  ;; The threshold is somewhat arbitrary and may be tuned later. The
  ;; intent is to answer `true` when the two sets of files are
  ;; "mostly the same".
  (let [same-files-threshold 0.85
        existing-empty? (empty? existing-paths)
        new-empty? (empty? new-paths)]
    (if new-empty?
      ;; An empty new file only replaces an equally empty old one. If
      ;; the old one has content we keep it, since the new one looks
      ;; problematic.
      existing-empty?
      (let [[shared only-in-existing only-in-new]
            (file-set-comparison/path-diff-score existing-paths new-paths)]
        (and
          ;; Nothing in common: the new one does not supersede.
          (not (zero? shared))
          (or
            ;; Nothing differs at all: keep the new one.
            (and (zero? only-in-existing) (zero? only-in-new))
            ;; Roughly the same files. `only-in-new` is deliberately
            ;; ignored: if it is large, the existing def was probably a
            ;; subset of the new one; if it is small, the threshold
            ;; comparison already prevents a real miss.
            (< same-files-threshold
               (/ shared (+ shared only-in-existing)))))))))

(defn data-def-supersedes?
  "Decides whether `new-dd`, the more recent of two code coverage data
  definitions, should replace `existing-dd` (returns `true`) or whether
  both must be kept (returns `false`).

  Both are kept when they differ in `:repo` or `:metric`, or when they
  do not reference the same files.

  Two data definitions are considered to reference the same files when
  85% of the files are in both data definitions."
  [existing-dd new-dd]
  (let [identity-of #(select-keys % [:repo :metric])
        subpath-verdict (subpath-compare (:subpath existing-dd) (:subpath new-dd))]
    (if (not= (identity-of existing-dd) (identity-of new-dd))
      ;; Different repo or metric: the new one can never replace the
      ;; existing one.
      false
      (case subpath-verdict
        :supersede true
        :not-supersede false
        ;; The subpaths gave no answer, so compare the actual files.
        (path-tree-compare existing-dd new-dd)))))


(defn replace-superseded-data-defs
  "Removes coverage data defs that have been superseded by more recent
  ones, since we cannot otherwise know which are still valid.

  Expects `coverage-data-defs` to be sorted by date, oldest first.
  Returns a vector in which each data def is preceded by the newer
  ones that were kept (i.e. newest first).

  Usually this should be reached through
  `faster-replace-superseded-data-defs`, which avoids many
  unnecessary, repeated comparisons."
  [coverage-data-defs]
  (letfn [(keep-unsuperseded [kept newest]
            ;; The incoming def goes to the front; every previously kept
            ;; def that it supersedes is dropped.
            (into [newest]
                  (remove (fn [older] (data-def-supersedes? older newest)))
                  kept))]
    (vec (reduce keep-unsuperseded '() coverage-data-defs))))


(defn sublists
  "Partitions `data-defs` into groups that share the same `:repo` and
  `:metric`, returning just the groups (each in input order)."
  [data-defs]
  (->> data-defs
       (group-by (juxt :repo :metric))
       vals))

;(sublists [{:repo 1 :metric "line"} {:repo 1 :metric "branch"} {:repo 1 :metric "line"}])

(defn faster-replace-superseded-data-defs
  "An optimization around `replace-superseded-data-defs`. The input is
  first split into sub-sequences whose members are potentially
  comparable, and each sub-sequence is processed on its own, which
  eliminates many comparisons.

  In the worst case of `replace-superseded-data-defs`, when nothing
  supersedes the first item, every later item is compared to it; with
  no matches at all that is n^2 comparisons. Splitting n into several
  smaller ns reduces that cost where possible.

  The surviving data defs are returned as a vector sorted by
  `:created-at` and then reversed."
  [coverage-data-defs]
  (let [comparable-groups (sublists coverage-data-defs)
        survivors (mapcat replace-superseded-data-defs comparable-groups)
        by-creation-time (sort-by :created-at survivors)]
    (vec (reverse by-creation-time))))


(defmulti long-hash-for
  "Resolves `short-hash` to its long form.

  Dispatches on the kind of `git-client`: `:function` when it is a
  function, `:command` otherwise."
  (fn [git-client _repo-path _short-hash]
    (if-not (fn? git-client)
      :command
      :function)))

;; Delegates directly to `vcs/to-long-hash`.
(defmethod long-hash-for :command
  [git-client repo-path short-hash]
  (vcs/to-long-hash git-client repo-path short-hash))

;; Calls the injected function with a request map; `:vcs-equivalent`
;; tells it which vcs operation is being asked for.
(defmethod long-hash-for :function
  [injected-fn repo-path short-hash]
  (let [request {:repo-path repo-path
                 :short-hash short-hash
                 :vcs-equivalent :to-long-hash}]
    (injected-fn request)))


;; The fields that identify which coverage data defs compete with each other.
(def grouping-keys [:repo :repo-path :metric])

(defn- known-match-pred
  "Collects, once, the tuples of `match-keys` values found in
  `known-matches` and returns a predicate that is `true` for any map
  whose `match-keys` values form one of those tuples."
  [match-keys known-matches]
  (let [extract (apply juxt match-keys)
        matching-set (into #{} (map extract) known-matches)]
    (fn [match-target]
      (contains? matching-set (extract match-target)))))

(defn- matches-known?
  "Returns `true` when `match-target` has the same values for
  `match-keys` as at least one element of `known-matches`.

  Builds the lookup set on every call; use `known-match-pred` when the
  same `known-matches` are checked against many targets."
  [match-keys known-matches match-target]
  ((known-match-pred match-keys known-matches) match-target))

(defn duplicates-of-exact-commit-matches-pred-from
  "Make a predicate that will return `true` on any coverage-data-def
  that matches at least one of the `exact-commit-match-defs` provided
  as an argument in terms of repo, repo-path and metric, with the
  exception of data defs that are perfect commit matches, since those
  are the ones we want to keep."
  [exact-commit-match-defs analyzed-commit]
  ;; The lookup set is built here, once, rather than on every call of
  ;; the returned predicate.
  (let [matches-fields? (known-match-pred grouping-keys exact-commit-match-defs)]
    (fn [ddef]
      (and (not= analyzed-commit (:commit-sha ddef))
           (matches-fields? ddef)))))


(defn- ensure-analyzed-commit-is-first
  "Gives priority to coverage data whose sha exactly matches the
  analysis commit. This is trickier than it sounds: we may have exact
  matches for some of the files, while newer uploads are the only
  data source for other parts of the project. Simply cutting the list
  off at the analyzed commit could therefore discard data we want to
  keep.

  1. With no exactly matching data def, `coverage-data-defs` is
  returned as it was given.

  2. Otherwise, everything up to and including the last exact match is
  kept. Of the data defs that come after it, those that share repo,
  metric and repo-path with an exact commit match are dropped, while
  those that differ in any of these are kept. Because there may be
  several exact matches, the filter never removes an exact match
  itself (see `duplicates-of-exact-commit-matches-pred-from`)."
  [analyzed-commit coverage-data-defs]
  (let [exact-match? (fn [ddef] (= analyzed-commit (:commit-sha ddef)))
        exact-commit-match-defs (filter exact-match? coverage-data-defs)]
    (if (empty? exact-commit-match-defs)
      coverage-data-defs
      ;; Walk the list from the end: the leading run of non-matches is
      ;; what comes after the last exact match in the original order.
      (let [[after-match-reversed up-to-match-reversed]
            (split-with (complement exact-match?) (reverse coverage-data-defs))
            redundant? (duplicates-of-exact-commit-matches-pred-from
                         exact-commit-match-defs
                         analyzed-commit)]
        (concat (reverse up-to-match-reversed)
                (remove redundant? (reverse after-match-reversed)))))))

(defn- malformed-path?
  "Checks that `path` is a non-nil string ending in \".edn\". Returns
  nil when it is, otherwise a message naming the first rule that was
  broken."
  [path]
  (when-let [reason (cond
                      (not (some? path)) "Path must be truthy."
                      (not (string? path)) "Path must be a string."
                      (not (str/ends-with? path ".edn")) "Path must end with '.edn'.")]
    (format "Coverage path %s could not be used. %s" path reason)))

(defn- log-and-swallow-fetch-exceptions
  "Wraps `fetch-fn` (a function of a path) in a function of `repo` and
  `path` that returns whatever `(fetch-fn path)` returns, or nil when
  it throws. `repo` is only used in the log message.

  There's a possible race condition if (as we recommend) you run an
  analysis immediately after the upload. The housekeeping may take
  some time to complete, and since it runs in a future, it's possible
  that some of the data definitions sent to the analysis library are
  no longer valid. We need to be able to ignore these (ie. filter them
  out) without failing the entire analysis."
  [fetch-fn]
  (fn [repo path]
    (try
      (fetch-fn path)
      (catch Throwable thr
        ;; Housekeeping is only the most likely cause, so the throwable
        ;; is handed to the logger to keep the real reason visible.
        (log/infof thr "Failed to fetch coverage data. The coverage file might have been deleted by housekeeping after the current analysis was started. path=%s repo=%s" path repo)
        nil))))


(defn data-defs-for-priority-filtering
  "Returns a lazy seq of `data-defs` where each one has a `:path-tree`.

  The coverage file is read with `read-coverage-data-fn` and reduced
  to a path tree right away, so that the complete coverage data does
  not have to be kept around; the tree form is more compact.

  When the data path is malformed or the fetch fails, `:path-tree` is
  set to a map with an `:error` message instead, so that callers can
  filter such data definitions out."
  [read-coverage-data-fn data-defs]
  (let [logging-fetch-fn (log-and-swallow-fetch-exceptions read-coverage-data-fn)
        tree-or-error
        (fn [repo data-path]
          (if-let [path-error-msg (malformed-path? data-path)]
            {:error path-error-msg}
            (if-let [fetched (logging-fetch-fn repo data-path)]
              (file-set-comparison/paths-to-tree (map :path fetched))
              {:error (format "Failure to fetch from correctly formed path: %s" data-path)})))]
    (map
     (fn [{:keys [data-path path-tree repo] :as ddef}]
       ;; A pre-set path-tree only happens in unit tests, where it is
       ;; an easy way to inject our own data.
       (cond-> ddef
         (not path-tree) (assoc :path-tree (tree-or-error repo data-path))))
     data-defs)))



(defn- validate-result
  "Builds a validation message map for `repo-path` from the commit shas
  of the available and of the applicable coverage uploads.

  The result always has `:level` and `:description`; warnings also
  carry `:remedy` and `:message`. Cases, in order of precedence:
  nothing available, nothing applicable, no upload for
  `analyzed-commit`, some uploads for other commits, all uploads match."
  [repo-path available-commit-shas applicable-commit-shas analyzed-commit]
  (let [for-repo (fn [template] (format template repo-path))
        exact-match? (contains? (set applicable-commit-shas) analyzed-commit)
        stale-shas (filterv #(not= analyzed-commit %) applicable-commit-shas)]
    (cond
      (empty? available-commit-shas)
      {:level :info
       :description (for-repo "No coverage data found for repository '%s'")}

      (empty? applicable-commit-shas)
      {:level :warn
       :description (for-repo "No applicable code coverage data was found for repo '%s'.")
       :remedy "Please try uploading more recent code coverage data that is a closer match to the branch targeted in the main analysis, then rerun the analysis."
       :message (for-repo "No applicable code coverage data found for '%s'. For better results, upload recent code coverage data for the same branch as the main analysis.")}

      (not exact-match?)
      {:level :warn
       :description (for-repo "The latest code coverage data for repo '%s' is not a perfect match for the branch targeted in the main analysis. There may be some file differences.")
       :remedy (for-repo "For best results, upload code coverage for the currently analyzed branch on '%s' and rerun the analysis")
       :message (for-repo "The commit of the uploaded code coverage data does not match the current analysis branch for '%s'. Some code coverage results may be out of sync.")}

      (seq stale-shas)
      {:level :warn
       :description (format "In repository '%s', %d out of %d uploads are behind the currently analyzed commit."
                            repo-path
                            (count stale-shas)
                            (count applicable-commit-shas))
       :remedy (format "Upload code coverage for commit '%s' and rerun the analysis." analyzed-commit)
       :message (for-repo "Showing partially stale code coverage data for '%s'. Please upload code coverage for the latest commit and rerun the analysis.")}

      :else
      {:level :info
       :description (for-repo "Found coverage data matching the current branch for repo '%s'")})))

(defn- commit-with-time
  "Returns a map with just `:commit` (the given `commit-hash`) and
  `:commit-time` (the `:commit-date` reported by `vcs-time-of-commit`
  for that commit)."
  [git-client repo-path commit-hash]
  (let [commit-info (vcs-time-of-commit git-client repo-path commit-hash)
        enriched (assoc commit-info
                        :commit commit-hash
                        :commit-time (:commit-date commit-info))]
    (select-keys enriched [:commit :commit-time])))

(defn applicable-coverage-data-for*
  "Selects the usable coverage data defs for one repository and
  validates them against the analyzed commit.

  The data defs are sorted by `:created-at`, prioritized around the
  analyzed commit, given a path tree, stripped of entries whose path
  tree carries an `:error`, reduced to those not superseded, and
  finally annotated with `:files-changed`.

  Returns a map with `:coverage-data-defs` and `:validation-result`."
  [{:keys [git-client repo repo-path read-coverage-data-fn analyzed-commit]} coverage-data-defs]
  (let [fetch-failed? (comp :error :path-tree)
        with-files-changed (fn [{:keys [commit-sha] :as dd}]
                             (assoc dd :files-changed
                                    (vcs-files-changed git-client repo-path commit-sha analyzed-commit)))
        prioritized-defs (->> coverage-data-defs
                              (sort-by :created-at)
                              (ensure-analyzed-commit-is-first analyzed-commit)
                              (data-defs-for-priority-filtering read-coverage-data-fn)
                              (remove fetch-failed?)
                              faster-replace-superseded-data-defs)
        usable-coverage-data-defs (map with-files-changed prioritized-defs)
        analyzed-commit-with-time (commit-with-time git-client repo-path analyzed-commit)]
    {:coverage-data-defs usable-coverage-data-defs
     :validation-result (validation-messaging/new-validate-result repo
                                                                  usable-coverage-data-defs
                                                                  analyzed-commit-with-time)}))

(defn applicable-coverage-data-for
  "Returns the data defs that are applicable to the specified repo-path"
  [{:keys [git-client repo-path] :as coverage-context} coverage-data-defs]
  ;; Validation must compare shas of the same kind, so both the
  ;; analyzed commit and the `:commit-sha` of every data def are
  ;; passed through `long-hash-for` first.
  (let [->long-hash (fn [sha] (long-hash-for git-client repo-path sha))
        context (-> coverage-context
                    (select-keys [:git-client :repo-path :repo :read-coverage-data-fn :analyzed-commit])
                    (update :analyzed-commit ->long-hash))
        long-sha-defs (map (fn [data-def] (update data-def :commit-sha ->long-hash))
                           coverage-data-defs)]
    (applicable-coverage-data-for* context long-sha-defs)))

(comment
  ;; REPL scratch example; forms inside `comment` are never evaluated.
  ;; NOTE(review): this call passes four positional arguments, whereas
  ;; `applicable-coverage-data-for` in this file takes two (a
  ;; coverage-context map and the coverage data defs). The example looks
  ;; stale -- confirm and update it before evaluating.
  (applicable-coverage-data-for "git"
                                "/Users/kalle/codescene/dev/repos/95335ce6e7ee635b246ea47e4932390b9ad8e937/react"
                                "f9dddcbbb"
                                [{:repo "github.com/knorrest/reacts",
                                  :metric "statement-coverage"
                                  :commit-sha "f9dddcbbb"}
                                 {:repo "github.com/knorrest/react",
                                  :commit-sha "e150a3242"}
                                 {:repo "github.com/knorrest/react",
                                  :commit-sha "6ef0dd4f2"}]))
