#' Analyze Cumulative Network Groups Over Time
#'
#' Performs cumulative community detection on a network over specified time spans,
#' returning group statistics and keyword analysis for each time period.
#'
#' @param comps A list containing network components, typically generated by
#'   \code{\link{sniff_components}}(). Must include a network object with
#'   'component' and 'PY' (publication year) vertex attributes.
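#' @param time_span Numeric vector of years to analyze. If \code{NULL} (the
#'   default), every year from the earliest to the latest publication year found
#'   in the data is used.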
#' @param min_group_size Minimum size for a cluster to be retained (default = 10).
#' @param keep_component Character vector specifying which network components to
#'   process (default = "c1"). Can include multiple components.
#' @param cluster_component Character vector specifying which components should be
#'   clustered (default = "c1"). Components not listed here will be treated
#'   as single groups.
#' @param top_n_keywords Number of top keywords to extract per group (default = 10).
#' @param algorithm Community detection algorithm to use. One of:
#'   \code{"louvain"}, \code{"walktrap"}, \code{"edge_betweenness"},
#'   \code{"fast_greedy"} (default), or \code{"leiden"}.
#' @param seed Random seed for reproducible results (default = 888L). Only used
#'   by the \code{"louvain"} and \code{"leiden"} algorithms; the other algorithms
#'   ignore it.
#'
#' @return A list with one element per entry of \code{time_span}, named
#'   \code{network_until_<year>}, where each element contains:
#' \describe{
#'   \item{groups}{A tibble with group statistics and top keywords}
#'   \item{documents}{A tibble mapping documents to groups}
#'   \item{network}{The cumulative network up to that year}
#' }
#'
#' @examples
#' \dontrun{
#' # Typical pipeline:
#' data <- read_wos("savedrecs.txt")
#' net <- sniff_network(data)
#' comps <- sniff_components(net)
#'
#' # Cumulative analysis
#' groups_cumulative <- sniff_groups_cumulative(
#'   comps,
#'   time_span = 2010:2020,
#'   keep_component = c("c1", "c2"),
#'   cluster_component = c("c1"),
#'   algorithm = "leiden",
#'   seed = 888L
#' )
#'
#' # Access results for 2015
#' groups_cumulative[["network_until_2015"]]$groups
#' }
#'
#' @importFrom purrr map map_int
#' @importFrom igraph vcount V cluster_louvain cluster_walktrap cluster_edge_betweenness
#' @importFrom igraph cluster_fast_greedy cluster_leiden as_undirected vertex_attr_names
#' @importFrom tidygraph as_tbl_graph activate
#' @importFrom dplyr filter group_by summarise arrange mutate n desc select relocate left_join tally slice_head
#' @importFrom tidyr separate_rows
#' @importFrom stringr str_trim
#' @export
sniff_groups_cumulative <- function(
  comps,
  time_span = NULL,
  min_group_size = 10,
  keep_component = c("c1"),
  cluster_component = c("c1"),
  top_n_keywords = 10,
  algorithm = "fast_greedy",
  seed = 888L) {

  # Input validation ----
  if (!is.list(comps)) {
    stop("Input must be a list generated by sniff_components() function", call. = FALSE)
  }

  net <- comps$network

  if (!inherits(net, c("tbl_graph", "igraph"))) {
    stop("Input must be a network object (tbl_graph or igraph)", call. = FALSE)
  }

  if (!"component" %in% igraph::vertex_attr_names(net)) {
    stop("Network must contain 'component' vertex attribute, generated by sniff_components()", call. = FALSE)
  }

  if (!"PY" %in% igraph::vertex_attr_names(net)) {
    stop("Network must contain 'PY' (publication year) vertex attribute, generated by sniff_components()", call. = FALSE)
  }

  # Length/NA guards keep `||` and `if` from failing on non-scalar input.
  if (!is.numeric(min_group_size) || length(min_group_size) != 1 ||
      is.na(min_group_size) || min_group_size < 1) {
    stop("min_group_size must be a positive integer", call. = FALSE)
  }

  valid_algorithms <- c("louvain", "walktrap", "edge_betweenness", "fast_greedy", "leiden")
  if (!is.character(algorithm) || length(algorithm) != 1 ||
      !algorithm %in% valid_algorithms) {
    stop("algorithm must be one of: ", paste(valid_algorithms, collapse = ", "), call. = FALSE)
  }

  # is.numeric() is TRUE for both integer and double.
  if (!is.numeric(seed)) {
    stop("seed must be a numeric or integer value", call. = FALSE)
  }

  if (!(is.numeric(time_span) || is.null(time_span))) {
    stop("time_span must be a numeric vector with years, or leave it as NULL to use all the data", call. = FALSE)
  }

  # Local helpers ----

  # Node table of a graph as a tibble (one row per vertex).
  node_table <- function(graph) {
    graph |>
      tidygraph::as_tbl_graph() |>
      tidygraph::activate(nodes) |>
      dplyr::as_tibble()
  }

  # Run the selected community detection algorithm on one component.
  # NOTE: set.seed() changes the session's global RNG state; it is only
  # called for the "louvain" and "leiden" branches.
  detect_communities <- function(graph) {
    switch(algorithm,
      "louvain" = {
        set.seed(seed)
        igraph::cluster_louvain(igraph::as_undirected(graph))
      },
      "walktrap" = igraph::cluster_walktrap(graph),
      "edge_betweenness" = igraph::cluster_edge_betweenness(graph),
      "fast_greedy" = igraph::cluster_fast_greedy(igraph::as_undirected(graph)),
      "leiden" = {
        set.seed(seed)
        igraph::cluster_leiden(igraph::as_undirected(graph))
      }
    )
  }

  # Cluster one component and return its group statistics (`aggregate`) and
  # the document-to-group mapping (`ids`). Groups smaller than
  # `min_group_size` are dropped from both outputs.
  cluster_one_component <- function(component_net) {
    # A kept component may not have any paper yet at an early cut-off year;
    # NULL entries are skipped when the per-component results are row-bound.
    if (igraph::vcount(component_net) == 0) {
      return(list(aggregate = NULL, ids = NULL))
    }

    communities <- detect_communities(component_net)

    node_tbl <- node_table(component_net)
    node_tbl$group <- communities$membership

    # Groups are renumbered by decreasing size *before* the size filter, so
    # a group's label does not depend on `min_group_size`.
    stats_with_mapping <- node_tbl |>
      dplyr::group_by(.data$group) |>
      dplyr::summarise(
        quantity_papers = dplyr::n(),
        average_age = mean(.data$PY, na.rm = TRUE),
        component = unique(.data$component),
        .groups = "drop"
      ) |>
      dplyr::arrange(.data$component, dplyr::desc(.data$quantity_papers)) |>
      dplyr::mutate(
        group_old = .data$group, # original membership id
        group_new = paste0(.data$component, "g", seq_len(dplyr::n()))
      ) |>
      dplyr::filter(.data$quantity_papers >= min_group_size)

    group_lookup <- stats_with_mapping |>
      dplyr::select("group_old", "group_new")

    # Documents whose group did not pass the size filter get NA from the
    # join and are removed. `DE` is optional, hence any_of().
    node_groups <- node_tbl |>
      dplyr::left_join(group_lookup, by = c("group" = "group_old")) |>
      dplyr::filter(!is.na(.data$group_new)) |>
      dplyr::select("name", group = "group_new", dplyr::any_of("DE"))

    group_stats <- stats_with_mapping |>
      dplyr::select(group = "group_new", "quantity_papers", "average_age")

    list(aggregate = group_stats, ids = node_groups)
  }

  # Treat a whole component as a single group named after the component.
  summarise_whole_component <- function(component_net) {
    node_tbl <- node_table(component_net)

    group_stats <- node_tbl |>
      dplyr::group_by(.data$component) |>
      dplyr::summarise(
        quantity_papers = dplyr::n(),
        average_age = mean(.data$PY, na.rm = TRUE),
        .groups = "drop"
      ) |>
      dplyr::select(group = "component", "quantity_papers", "average_age")

    node_groups <- node_tbl |>
      dplyr::select("name", group = "component", dplyr::any_of("DE"))

    list(aggregate = group_stats, ids = node_groups)
  }

  # Top keywords per group as a single "keyword (count); ..." string.
  # `DE` is expected to hold ";"-separated keywords.
  summarise_keywords <- function(doc_groups) {
    if (!"DE" %in% names(doc_groups)) {
      return(dplyr::tibble(group = character(), keywords = character()))
    }

    doc_groups |>
      tidyr::separate_rows("DE", sep = ";") |>
      dplyr::mutate(DE = stringr::str_trim(.data$DE)) |>
      dplyr::filter(!is.na(.data$DE) & .data$DE != "") |>
      dplyr::count(.data$group, .data$DE, name = "qtde") |>
      dplyr::arrange(.data$group, dplyr::desc(.data$qtde)) |>
      dplyr::group_by(.data$group) |>
      dplyr::slice_head(n = top_n_keywords) |>
      dplyr::mutate(keywords_freq = paste0(.data$DE, " (", .data$qtde, ")")) |>
      dplyr::summarise(
        keywords = paste(.data$keywords_freq, collapse = "; "),
        .groups = "drop"
      )
  }

  # Full analysis of the cumulative network up to `cutoff_year`.
  analyse_year <- function(cutoff_year, current_net) {
    # One sub-network per kept component, named by component id.
    comp <- purrr::map(keep_component, \(component_id) {
      current_net |>
        tidygraph::as_tbl_graph() |>
        tidygraph::activate(nodes) |>
        dplyr::filter(.data$component == component_id)
    }) |>
      purrr::set_names(keep_component)

    no_cluster <- comp[setdiff(keep_component, cluster_component)]
    to_cluster <- comp[intersect(keep_component, cluster_component)]

    all_results <- c(
      purrr::map(to_cluster, cluster_one_component),
      purrr::map(no_cluster, summarise_whole_component)
    )

    aggregates <- all_results |>
      purrr::map("aggregate") |>
      dplyr::bind_rows() |>
      dplyr::mutate(network_until = cutoff_year)

    doc_groups <- all_results |>
      purrr::map("ids") |>
      dplyr::bind_rows() |>
      dplyr::mutate(network_until = cutoff_year)

    final_groups <- aggregates |>
      dplyr::left_join(summarise_keywords(doc_groups), by = "group") |>
      dplyr::relocate("keywords", .after = "network_until")

    list(groups = final_groups, documents = doc_groups, network = current_net)
  }

  # Main pipeline ----
  tryCatch(
    {
      # Keep only the requested components; PY is coerced to numeric so the
      # year comparisons below are numeric.
      net_filtered <- net |>
        tidygraph::as_tbl_graph() |>
        tidygraph::activate(nodes) |>
        dplyr::mutate(PY = as.numeric(.data$PY)) |>
        dplyr::filter(.data$component %in% keep_component)

      # Default span: derived from the *kept* components only, so no leading
      # year is empty merely because a dropped component is older. Missing
      # years are ignored.
      if (is.null(time_span)) {
        pub_years <- igraph::V(net_filtered)$PY
        pub_years <- pub_years[is.finite(pub_years)]
        if (length(pub_years) == 0) {
          stop(
            "No articles with a valid publication year found in components: ",
            paste(keep_component, collapse = ", "),
            call. = FALSE
          )
        }
        time_span <- min(pub_years):max(pub_years)
      }

      # Cumulative network for each cut-off year (papers with PY <= year).
      netl <- purrr::map(time_span, \(cutoff_year) {
        net_filtered |>
          tidygraph::as_tbl_graph() |>
          tidygraph::activate(nodes) |>
          dplyr::filter(.data$PY <= cutoff_year)
      })

      n_nodes <- purrr::map_int(netl, igraph::vcount)
      if (any(n_nodes == 0)) {
        empty_years <- time_span[n_nodes == 0]
        stop("No articles found in years: ", paste(empty_years, collapse = ", "), call. = FALSE)
      }

      # Years and networks are walked in parallel, so repeated years in
      # `time_span` are handled without an index lookup.
      results <- purrr::map2(time_span, netl, analyse_year)
      names(results) <- paste("network_until", time_span, sep = "_")
      results
    },
    error = function(e) {
      stop("Error in cumulative group analysis: ", conditionMessage(e), call. = FALSE)
    }
  )
}
