diff --git a/gerrychain/accept.py b/gerrychain/accept.py
index 92d3b5c8..5463f13c 100644
--- a/gerrychain/accept.py
+++ b/gerrychain/accept.py
@@ -21,6 +21,8 @@ def cut_edge_accept(partition: Partition) -> bool:
     Always accepts the flip if the number of cut_edges increases.
     Otherwise, uses the Metropolis criterion to decide.
 
+    frm: TODO: Add documentation on what the "Metropolis criterion" is...
+
     :param partition: The current partition to accept a flip from.
     :type partition: Partition
diff --git a/gerrychain/constraints/contiguity.py b/gerrychain/constraints/contiguity.py
index e1077e4a..81674b84 100644
--- a/gerrychain/constraints/contiguity.py
+++ b/gerrychain/constraints/contiguity.py
@@ -1,67 +1,122 @@
 from heapq import heappop, heappush
 from itertools import count
-import networkx as nx
 from typing import Callable, Any, Dict, Set
 from ..partition import Partition
 import random
 from .bounds import SelfConfiguringLowerBound
+from ..graph import Graph
 
-def are_reachable(G: nx.Graph, source: Any, avoid: Callable, targets: Any) -> bool:
+# frm TODO: Remove this comment about NX dependencies (once we are all set with the work)
+#
+# NX dependencies:
+#     def _are_reachable(G: nx.Graph, ...)
+#     nx.is_connected(partition.subgraphs[part]) for part in _affected_parts(partition)
+#     adj = nx.to_dict_of_lists(partition.subgraphs[part])
+#
+
+# frm: TODO: Think about the efficiency of the routines in this module.  Almost all
+#            of these involve traversing the entire graph, and I fear that callers
+#            might make multiple calls.
+#
+#            Possible solutions are to 1) speed up these routines somehow and 2) cache
+#            results so that at least we don't do the traversals over and over.
+
+def _are_reachable(graph: Graph, start_node: Any, avoid: Callable, targets: Any) -> bool:
    """
     A modified version of NetworkX's function
     `networkx.algorithms.shortest_paths.weighted._dijkstra_multisource()`
 
-    This function checks if the targets are reachable from the source node
+    This function checks if the targets are reachable from the start_node
     while avoiding edges based on the avoid condition function.
 
-    :param G: The networkx graph
-    :type G: nx.Graph
-    :param source: The starting node
-    :type source: int
+    :param graph: Graph
+    :type graph: Graph
+    :param start_node: The starting node
+    :type start_node: int
     :param avoid: The function that determines if an edge should be avoided.
-        It should take in three parameters: the start node, the end node, and
-        the edges to avoid. It should return True if the edge should be avoided,
-        False otherwise.
+        It should take two parameters: the start node and the end node of the
+        edge under consideration, and return True if the edge should be
+        avoided, False otherwise.
+
+        (The original NetworkX-based code passed a third parameter, the edge
+        data dictionary from NetworkX.Graph._succ, but the only avoid function
+        ever passed in ignored it.  The code below has been modified to use
+        Graph.neighbors() instead of _succ - both because we can't use NX here
+        and because we don't need the edge data dictionary anyways - so avoid
+        functions now take only two parameters.)
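+
+        For illustration, a hypothetical avoid function with the expected
+        two-parameter signature (``assignment`` is assumed to be in scope):
+
+            def avoid_other_parts(start_node, end_node):
+                # Avoid edges that cross a part (district) boundary.
+                return assignment[start_node] != assignment[end_node]
+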
     :type avoid: Callable
     :param targets: The target nodes that we would like to reach
     :type targets: Any
 
-    :returns: True if all of the targets are reachable from the source node
+    :returns: True if all of the targets are reachable from the start_node
         under the avoid condition, False otherwise.
     :rtype: bool
     """
-    G_succ = G._succ if G.is_directed() else G._adj
-
     push = heappush
     pop = heappop
-    dist = {}  # dictionary of final distances
+    node_distances = {}  # dictionary of final distances
     seen = {}
     # fringe is heapq with 3-tuples (distance,c,node)
     # use the count c to avoid comparing nodes (may not be able to)
     c = count()
     fringe = []
-    seen[source] = 0
-    push(fringe, (0, next(c), source))
-
+    seen[start_node] = 0
+    push(fringe, (0, next(c), start_node))
+
+    # frm: Original Code:
+    #
+    # while not all(t in seen for t in targets) and fringe:
+    #     (d, _, v) = pop(fringe)
+    #     if v in dist:
+    #         continue  # already searched this node.
+    #     dist[v] = d
+    #     for u, e in G_succ[v].items():
+    #         if avoid(v, u, e):
+    #             continue
+    #
+    #         vu_dist = dist[v] + 1
+    #         if u not in seen or vu_dist < seen[u]:
+    #             seen[u] = vu_dist
+    #             push(fringe, (vu_dist, next(c), u))
+    #
+    # return all(t in seen for t in targets)
+
+    # While we have not yet seen all of our targets and while there is
+    # still some fringe...
+    while not all(tgt in seen for tgt in targets) and fringe:
+        (distance, _, node_id) = pop(fringe)
+        if node_id in node_distances:
             continue  # already searched this node.
-        dist[v] = d
-        for u, e in G_succ[v].items():
-            if avoid(v, u, e):
+        node_distances[node_id] = distance
+
+        for neighbor in graph.neighbors(node_id):
+            if avoid(node_id, neighbor):
                 continue
 
-            vu_dist = dist[v] + 1
-            if u not in seen or vu_dist < seen[u]:
-                seen[u] = vu_dist
-                push(fringe, (vu_dist, next(c), u))
+            neighbor_distance = node_distances[node_id] + 1
+            if neighbor not in seen or neighbor_distance < seen[neighbor]:
+                seen[neighbor] = neighbor_distance
+                push(fringe, (neighbor_distance, next(c), neighbor))
 
-    return all(t in seen for t in targets)
+    # frm: TODO: Simplify this code.  It computes distances and counts but
+    #            never uses them.  These must be relics of code copied
+    #            from somewhere else where it had more uses...
+    return all(tgt in seen for tgt in targets)
 
 
 def single_flip_contiguous(partition: Partition) -> bool:
     """
@@ -87,7 +142,7 @@ def single_flip_contiguous(partition: Partition) -> bool:
     graph = partition.graph
     assignment = partition.assignment
 
-    def partition_edge_avoid(start_node: Any, end_node: Any, edge_attrs: Dict):
+    def _partition_edge_avoid(start_node: Any, end_node: Any):
         """
         Helper function used in the graph traversal to avoid edges that cross between
         different assignments. It's crucial for ensuring that the traversal only considers
         paths within
@@ -98,7 +153,7 @@ def partition_edge_avoid(start_node: Any, end_node: Any, edge_attrs: Dict):
         :param end_node: The end node of the edge.
         :type end_node: Any
-        :param edge_attrs: The attributes of the edge (not used in this function). Needed
-            because this function is passed to :func:`are_reachable`, which expects the
-            avoid function to have this signature.
-        :type edge_attrs: Dict
+        (This helper is passed to :func:`_are_reachable`, which expects the
+        avoid function to take exactly these two parameters.)
@@ -126,8 +181,10 @@ def partition_edge_avoid(start_node: Any, end_node: Any, edge_attrs: Dict):
     start_neighbor = random.choice(old_neighbors)
 
     # Check if all old neighbors in the same assignment are still reachable.
-    connected = are_reachable(
-        graph, start_neighbor, partition_edge_avoid, old_neighbors
+    # The "_partition_edge_avoid" function will prevent searching across
+    # a part (district) boundary
+    connected = _are_reachable(
+        graph, start_neighbor, _partition_edge_avoid, old_neighbors
     )
 
     if not connected:
@@ -138,7 +195,7 @@ def partition_edge_avoid(start_node: Any, end_node: Any, edge_attrs: Dict):
     return True
 
 
-def affected_parts(partition: Partition) -> Set[int]:
+def _affected_parts(partition: Partition) -> Set[int]:
     """
     Checks which parts of the partition were affected by the change of nodes.
 
@@ -168,7 +225,7 @@ def affected_parts(partition: Partition) -> Set[int]:
 def contiguous(partition: Partition) -> bool:
     """
-    Check if the parts of a partition are connected using :func:`networkx.is_connected`.
+    Check if the parts of a partition are connected.
 
     :param partition: The proposed next :class:`~gerrychain.partition.Partition`
     :type partition: Partition
 
     :returns: Whether the partition is contiguous
     :rtype: bool
     """
+    # frm: Original code:
+    #
+    # return all(
+    #     nx.is_connected(partition.subgraphs[part]) for part in _affected_parts(partition)
+    # )
+
     return all(
-        nx.is_connected(partition.subgraphs[part]) for part in affected_parts(partition)
+        is_connected_bfs(partition.subgraphs[part]) for part in _affected_parts(partition)
     )
 
-
 def contiguous_bfs(partition: Partition) -> bool:
     """
     Checks that a given partition's parts are connected as graphs using a simple
     breadth-first search.
 
     :param partition: Partition to check.
     :type partition: Partition
 
     :returns: Whether the parts of this partition are connected
     :rtype: bool
     """
-    parts_to_check = affected_parts(partition)
-
-    # Generates a subgraph for each district and perform a BFS on it
-    # to check connectedness.
-    for part in parts_to_check:
-        adj = nx.to_dict_of_lists(partition.subgraphs[part])
-        if _bfs(adj) is False:
-            return False
-
-    return True
-
+
+    # frm: TODO: Try to figure out why this routine exists.  It seems to be
+    #            exactly the same conceptually as contiguous().  It looks
+    #            at the "affected" parts - those that have changed node
+    #            assignments from the parent - and sees if those parts are
+    #            contiguous.
+    #
+    #            For now, I have just replaced the existing code which depended
+    #            on NX with a call on contiguous(partition).
+
+    # frm: Original Code:
+    #
+    # parts_to_check = _affected_parts(partition)
+    #
+    # # Generates a subgraph for each district and perform a BFS on it
+    # # to check connectedness.
+    # for part in parts_to_check:
+    #     adj = nx.to_dict_of_lists(partition.subgraphs[part])
+    #     if _bfs(adj) is False:
+    #         return False
+    #
+    # return True
+
+    return contiguous(partition)
 
 def number_of_contiguous_parts(partition: Partition) -> int:
     """
@@ -212,8 +288,12 @@ def number_of_contiguous_parts(partition: Partition) -> int:
     :returns: Number of contiguous parts in the partition.
     :rtype: int
     """
+    # frm: Original Code:
+    #
+    # parts = partition.assignment.parts
+    # return sum(1 for part in parts if nx.is_connected(partition.subgraphs[part]))
+
     parts = partition.assignment.parts
-    return sum(1 for part in parts if nx.is_connected(partition.subgraphs[part]))
+    return sum(1 for part in parts if is_connected_bfs(partition.subgraphs[part]))
 
 
 # Create an instance of SelfConfiguringLowerBound using the number_of_contiguous_parts function.
@@ -235,11 +315,37 @@ def contiguous_components(partition: Partition) -> Dict[int, list]:
         subgraphs of that part of the partition
     :rtype: dict
     """
-    return {
-        part: [subgraph.subgraph(nodes) for nodes in nx.connected_components(subgraph)]
-        for part, subgraph in partition.subgraphs.items()
-    }
+    # frm: TODO: NX vs RX Issues here:
+    #
+    #   The call on subgraph() below is perhaps problematic because it will renumber
+    #   node_ids...
+    #
+    #   The issue is not that the code is incorrect (with RX there is really no other
+    #   option), but rather that any legacy code will be unprepared to deal with the fact
+    #   that the subgraphs returned are (I think) three node translations away from the
+    #   original NX-Graph object's node_ids.
+    #
+    #   Translations:
+    #
+    #       1) From NX to RX when the partition was created
+    #       2) From the top-level RX graph to the partition's subgraphs for each part (district)
+    #       3) From each part's subgraph to the subgraphs of contiguous_components...
 
+    # frm: Original Code:
+    #
+    # return {
+    #     part: [subgraph.subgraph(nodes) for nodes in nx.connected_components(subgraph)]
+    #     for part, subgraph in partition.subgraphs.items()
+    # }
+
+    connected_components_in_each_partition = {}
+    for part, subgraph in partition.subgraphs.items():
+        # create a subgraph for each set of connected nodes in the part's nodes
+        list_of_connected_subgraphs = subgraph.subgraphs_for_connected_components()
+        connected_components_in_each_partition[part] = list_of_connected_subgraphs
+
+    return connected_components_in_each_partition
 
 def _bfs(graph: Dict[int, list]) -> bool:
     """
@@ -254,6 +360,7 @@ def _bfs(graph: Dict[int, list]) -> bool:
     """
     q = [next(iter(graph))]
     visited = set()
+    # frm TODO: Make sure len() is defined on Graph object...
     total_vertices = len(graph)
 
     # Check if the district has a single vertex. If it does, then simply return
@@ -272,3 +379,26 @@ def _bfs(graph: Dict[int, list]) -> bool:
         q += [neighbor]
 
     return total_vertices == len(visited)
+
+# frm: TODO: Verify that is_connected_bfs() works - add a test or two...
+
+# frm: Code obtained from the web - probably could be optimized...
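+# A sanity check along the lines of the TODO above (hypothetical; assumes
+# Graph.from_networkx() from graph.py and a networkx import in the test):
+#
+#     assert is_connected_bfs(Graph.from_networkx(networkx.path_graph(4)))
+#     assert not is_connected_bfs(Graph.from_networkx(networkx.empty_graph(2)))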
+# This code replaced calls on nx.is_connected()
+def is_connected_bfs(graph: Graph) -> bool:
+    if not graph:
+        return True
+
+    nodes = list(graph.node_indices)
+
+    start_node = random.choice(nodes)
+    visited = {start_node}
+    queue = [start_node]
+
+    while queue:
+        current_node = queue.pop(0)
+        for neighbor in graph.neighbors(current_node):
+            if neighbor not in visited:
+                visited.add(neighbor)
+                queue.append(neighbor)
+
+    return len(visited) == len(nodes)
diff --git a/gerrychain/graph/graph.py b/gerrychain/graph/graph.py
index fdd905a8..ba3ac2ba 100644
--- a/gerrychain/graph/graph.py
+++ b/gerrychain/graph/graph.py
@@ -21,10 +21,107 @@
 from networkx.readwrite import json_graph
 import pandas as pd
 
+# frm: added to support RustworkX graphs (in the future)
+import rustworkx
+
 from .adjacency import neighbors
 from .geo import GeometryError, invalid_geometries, reprojected
 from typing import List, Iterable, Optional, Set, Tuple, Union
 
+import geopandas as gp
+from shapely.ops import unary_union
+from shapely.prepared import prep
+
+import numpy as np
+from scipy.sparse import csr_array
+
+
+#########################################################
+# frm Overview of changes (May 2025):
+"""
+This comment is temporary - it describes the work done to encapsulate the dependency on
+NetworkX so that this file is the only file that has any NetworkX dependencies.
+That work is not completely done - there are bits and bobs of NetworkX
+dependencies outside this file, but they are at least commented.  In short,
+this comment attempts to make clear what I am trying to do.
+
+The idea is to replace the old Graph object (that was a subclass of NetworkX.Graph) with
+a new Graph object that is not a subclass of anything.  This new Graph class would look
+and act like the old NetworkX based Graph object.  Under the covers it would have
+either an NX Graph or an RX PyGraph.
+
+There is a legitimate question - why bother to retain the option to use a NetworkX Graph
+as the underlying graph object, if the user cannot know what the underlying graph object
+is?  There are two answers:
+    1) It seemed possible to me that users took advantage, in their own code, of the fact
+       that the Graph object was in fact a NetworkX Graph object.  If that is so, then we
+       can make life easier for them by providing an easy way to gain access to the
+       internal NetworkX Graph object so they can continue to use that code.
+    2) It was a convenient way to evolve the code - I could make changes, but still
+       have some old NX code that I could use as short-term hacks.  It allowed me to
+       use a regression test to make sure that it all still ran - some of it running
+       with the new Graph object and some of it hacked to operate on the underlying
+       NX Graph data member.
+
+In the future, if #1 is not an issue, we can just expunge NetworkX completely.
+
+I noticed that the FrozenGraph class had already implemented the behavior of the Graph
+class but without being a subclass of the NetworkX Graph object.  So, my new Graph class
+was based in large part on the FrozenGraph code.  It helped me grok property decorators
+and __getattr__ and __getattribute__ - interesting Pythonic stuff!
+
+It is not the case that ALL of the behavior of the NX based Graph class is replicated
+in the new Graph class - I have not implemented NodeView and EdgeView functionality,
+and maybe I will not have to.
+
+Note that one of the biggest differences between NetworkX and RustworkX is in how nodes
+and edges are identified.
+In NetworkX there is not really a difference between the "index"
+of a node or an edge and its "name" or "ID" or "value".  In NetworkX the way you index into nodes
+and edges is by using the node's name/ID or the edge's tuple - in effect the index and the
+name/ID/value are the same.  However, in RustworkX, the index is always an integer, and furthermore
+the set of indexes for both nodes and edges starts at zero with consecutive integer values.  This
+is one of the things that allows RustworkX to be faster than NetworkX.  Converting to using
+RustworkX, therefore, required that the code distinguish between a node/edge's index and its value/ID/name.
+This is most visible in the use of the node_data() and edge_data() functions and
+in the changes made to the use of subgraphs (which unfortunately have different index values for nodes than
+the parent graph in RX).
+
+A note on subgraphs: Creating subgraphs is a fundamental operation for GerryChain.  When using NX,
+a subgraph's node (and also edge) indexes were unchanged from the parent's, so it was safe to do
+calculations on a subgraph and pass back node information (like flips).  However, when using RX, the
+node and edge indexes change, so in order to pass back information in the parent's index system, the
+subgraph's nodes and edges need to be translated back into those of the parent's index system.
+In order to do this, every graph contains two new bits of information: 1) whether it is a subgraph, and
+2) a mapping from the subgraph index values to those of its parent.  For top-level (non-subgraph) graphs, this
+mapping is just an identity mapping - this is just a convenience so that routines can always use the
+map without having to worry about whether it is a subgraph or not.
+
+The current state of affairs (early May 2025) is that the code in tree.py has mostly
+been converted to use the new Graph object instead of nx.Graph, and that the regression
+test works (which only tests some of the functionality, but it does run a chain...)
+
+I have left the original code for the old Graph object in the file so that I could test
+that the original and the new code behave the same way - see tests/frm_tests/test_frm_old_vs_new_graph.py
+These frm_tests are not yet configured to run as pytest tests, but they soon will be.
+I will add additional tests here over time.
+
+Most of the NetworkX dependencies that remain are on NX algorithms (like is_connected() and
+laplacian_matrix()).  These need to be replaced with functions that work on RustworkX graphs.
+I have not yet determined whether they all need to work on both NX and RX graphs - if they
+only ever need to work on graphs inside Partitions, then they only need to work for RX, but
+it may be convenient to have them work both ways - needs some thought, and it might be easier
+to just provide compatibility to cover any edge case that I can't think of...
+
+After getting rid of all NX dependencies outside this file, it will be time to switch to
+RX, which will involve:
+
+    1) Creating RX versions of NX functionality - such as laplacian_matrix().  There are
+       lots of comments in the code saying: # frm TODO: RX version NYI...
+
+    2) Adding code so that when we "freeze" a graph, we also convert it to RX.
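+
+An illustrative end-to-end flow, using only names defined in this file:
+
+    nx_graph = networkx.Graph([(0, 1), (1, 2)])
+    graph = Graph.from_networkx(nx_graph)      # NX-backed wrapper
+    rx_based = graph.convert_from_nx_to_rx()   # RX-backed wrapper
+    assert rx_based.is_rx_graph()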
+ +""" +######################################################### def json_serialize(input_object: Any) -> Optional[int]: """ @@ -47,18 +144,1301 @@ def json_serialize(input_object: Any) -> Optional[int]: return None +class Graph: + """ + frm TODO: Clean up this documentation + + frm: this class encapsulates / hides the underlying graph which can either be a + NetworkX graph or a RustworkX graph. The intent is that it provides the same + external interface as a NetworkX graph (for all of the uses that GerryChain cares + about, at least) so that legacy code that operated on NetworkX based Graph objects + can continue to work unchanged. + + When a graph is added to a partition, however, the NX graph will be converted into + an RX graph and the NX graph will become unaccessible to the user. The RX graph + may also be "frozen" the way the NX graph was "frozen" in the legacy code, but we + have not yet gotten that far in the implementation. + + It is not clear whether the code that does the heavy lifting on partitions will + need to use the old NX syntax or whether it will be useful to allow unfettered + access to the RX graph so that RX code can be used in these modules. TBD... + -class Graph(networkx.Graph): """ - Represents a graph to be partitioned, extending the :class:`networkx.Graph`. - This class includes additional class methods for constructing graphs from shapefiles, - and for saving and loading graphs in JSON format. + # frm: This class cannot have a constructor - because there is code that assumes + # that it can use the default constructor to create instances of it. + # That code is buried deep in non GerryChain code, so I don't really understand + # what it is doing, but the assignment of nx_graph and rx_graph class attributes/members + # needs to happen in the "from_xxx()" routines. + # + # def __init__(self, nx_graph: networkx.Graph, rx_graph: rustworkx.PyGraph) -> None: + # # frm TODO: check that exactly one param is not None - need one and only one graph... + # self._nx_graph = nx_graph + # self._rx_graph = rx_graph + + # frm: TODO: Add documentation for new data members I am adding: + # _nx_graph, _rx_graph, _node_id_to_parent_node_id_map, _is_a_subgraph + + @classmethod + def from_networkx(cls, nx_graph: networkx.Graph) -> "Graph": + graph = cls() + graph._nx_graph = nx_graph + graph._rx_graph = None + graph._is_a_subgraph = False # See comments on RX subgraph issues. + # Maps node_ids in the graph to the "parent" node_ids in the parent graph. + # For top-level graphs, this is just an identity map + graph._node_id_to_parent_node_id_map = {node: node for node in graph.node_indices} + # Maps node_ids in the graph to the "original" node_ids in parent graph. + # For top-level graphs, this is just an identity map + graph._node_id_to_original_node_id_map = {node: node for node in graph.node_indices} + graph.nx_to_rx_node_id_map = None # only set when an NX based graph is converted to be an RX based graph + return graph + + @classmethod + def from_rustworkx(cls, rx_graph: rustworkx.PyGraph) -> "Graph": + graph = cls() + graph._rx_graph = rx_graph + graph._nx_graph = None + graph._is_a_subgraph = False # See comments on RX subgraph issues. + # Maps node_ids in the graph to the "parent" node_ids in the parent graph. + # For top-level graphs, this is just an identity map + graph._node_id_to_parent_node_id_map = {node: node for node in graph.node_indices} + # Maps node_ids in the graph to the "original" node_ids in parent graph. 
+        # For top-level graphs, this is just an identity map
+        graph._node_id_to_original_node_id_map = {node: node for node in graph.node_indices}
+        # only set when an NX based graph is converted to be an RX based graph
+        graph._nx_to_rx_node_id_map = None
+        return graph
+
+    # frm: TODO: Create a test for this routine
+    def original_node_ids_for_set(self, set_of_nodes):
+        # Utility routine to quickly translate a set of node_ids to their original node_ids
+        _node_id_to_original_node_id_map = self._node_id_to_original_node_id_map
+        new_set = {_node_id_to_original_node_id_map[node_id] for node_id in set_of_nodes}
+        return new_set
+
+    # frm: TODO: Create a test for this routine
+    def original_node_ids_for_list(self, list_of_nodes):
+        # Utility routine to quickly translate a list of node_ids to their original node_ids
+        _node_id_to_original_node_id_map = self._node_id_to_original_node_id_map
+        new_list = [_node_id_to_original_node_id_map[node_id] for node_id in list_of_nodes]
+        return new_list
+
+    def original_node_id_for_internal_node_id(self, internal_node_id):
+        return self._node_id_to_original_node_id_map[internal_node_id]
+
+    # frm: TODO: Create a test for this routine
+    def internal_node_id_for_original_node_id(self, original_node_id):
+        # frm: TODO: Think about a better way to map original_node_ids to internal node_ids.
+        #
+        #            The problem is that when this routine is called, it may often be called
+        #            repeatedly for a list of nodes, and we create the reverse dict every time
+        #            this is called, which is needlessly expensive.  We could just cache this
+        #            reverse map, but that is often dangerous because we have two sources of
+        #            truth, and if someone needs to update one they may forget to update the
+        #            other...
+
+        # reverse the map so we can go from original node_id to internal node_id
+        original_node_id_to_internal_node_id_map = {
+            v: k for k, v in self._node_id_to_original_node_id_map.items()
+        }
+        return original_node_id_to_internal_node_id_map[original_node_id]
+
+    def verify_graph_is_valid(self):
+
+        # Sanity check - this is where to add additional sanity checks in the future.
+
+        # Checks that there is one and only one graph
+        if not (
+            (self._nx_graph is not None and self._rx_graph is None)
+            or (self._nx_graph is None and self._rx_graph is not None)
+        ):
+            raise Exception("Graph.verify_graph_is_valid - graph not properly configured")
+
+    def is_nx_graph(self):
+        self.verify_graph_is_valid()
+        return self._nx_graph is not None
+
+    def get_nx_graph(self):
+        if not self.is_nx_graph():
+            raise Exception("get_nx_graph - graph is not an NX version of Graph")
+        return self._nx_graph
+
+    def get_rx_graph(self):
+        if not self.is_rx_graph():
+            raise Exception("get_rx_graph - graph is not an RX version of Graph")
+        return self._rx_graph
+
+    def is_rx_graph(self):
+        self.verify_graph_is_valid()
+        return self._rx_graph is not None
+
+    def convert_from_nx_to_rx(self) -> "Graph":
+        # Return a Graph object which has a RustworkX Graph object as its
+        # embedded graph object.
+        #
+        # Note that in both cases in the if-stmt below, the nodes are not copied.
+        # This is arguably dangerous, but in our case I think it is OK.  Stated
+        # differently, the actual node data (the dictionaries) in the original
+        # graph (self) will be reused in the returned graph - either because we
+        # are just returning the same graph (if it is already based on rx.PyGraph)
+        # or if we are converting it from NX.
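+        #
+        # For example (illustrative, restating the sharing claim above;
+        # some_nx_node_id is a placeholder):
+        #
+        #     rx_based = graph.convert_from_nx_to_rx()
+        #     rx_id = rx_based.get_nx_to_rx_node_id_map()[some_nx_node_id]
+        #     rx_based.node_data(rx_id)["flag"] = 1
+        #     # the same node's data dict in the original NX graph now has "flag"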
+        #
+        self.verify_graph_is_valid()
+        if self.is_nx_graph():
+            rx_graph = rustworkx.networkx_converter(self._nx_graph, keep_attributes=True)
+
+            converted_graph = Graph.from_rustworkx(rx_graph)
+
+            # Create a mapping from the old NX node_ids to the new RX node_ids (created by
+            # RX when it converts from NX)
+            nx_to_rx_node_id_map = {
+                converted_graph.node_data(node_id)["__networkx_node__"]: node_id
+                for node_id in converted_graph._rx_graph.node_indices()
+            }
+            converted_graph._nx_to_rx_node_id_map = nx_to_rx_node_id_map
+
+            return converted_graph
+        elif self.is_rx_graph():
+            return self
+        else:
+            raise Exception("convert_from_nx_to_rx: Bad kind of Graph object")
+
+    def get_nx_to_rx_node_id_map(self):
+        # Simple getter method
+        if not self.is_rx_graph():
+            raise Exception("get_nx_to_rx_node_id_map: Graph is not an RX based Graph")
+
+        return self._nx_to_rx_node_id_map
+
+    @classmethod
+    def from_json(cls, json_file: str) -> "Graph":
+        # frm TODO: Do we want to be able to go from JSON directly to RX?
+        #
+        #           Peter said that this is not a priority - that we only need RX after
+        #           creating a partition, but maybe in the future if we decide to
+        #           encourage an all-RX world...
+
+        with open(json_file) as f:
+            data = json.load(f)
+
+        # frm: A bit of Python magic - an adjacency graph is a dict of dict of dicts,
+        #      which is structurally equivalent to a NetworkX graph, so you can just
+        #      pretend that is what it is and it all works.
+        nx_graph = json_graph.adjacency_graph(data)
+        graph = cls.from_networkx(nx_graph)
+        graph.issue_warnings()
+        return graph
+
+    def to_json(self, json_file: str, include_geometries_as_geojson: bool = False) -> None:
+        # frm TODO: Implement this for an RX based graph
+        if not self.is_nx_graph():
+            raise Exception("At present, can only create JSON for a NetworkX graph")
+
+        data = json_graph.adjacency_data(self._nx_graph)
+
+        if include_geometries_as_geojson:
+            convert_geometries_to_geojson(data)
+        else:
+            remove_geometries(data)
+
+        with open(json_file, "w") as f:
+            json.dump(data, f, default=json_serialize)
+
+    @classmethod
+    def from_file(
+        cls,
+        filename: str,
+        adjacency: str = "rook",
+        cols_to_add: Optional[List[str]] = None,
+        reproject: bool = False,
+        ignore_errors: bool = False,
+    ) -> "Graph":
+        """
+        Create a :class:`Graph` from a shapefile (or GeoPackage, or GeoJSON, or
+        any other format that :mod:`geopandas` can read).  See :meth:`from_geodataframe`
+        for more details.
+
+        :param filename: Path to the shapefile / GeoPackage / GeoJSON / etc.
+        :type filename: str
+        :param adjacency: The adjacency type to use ("rook" or "queen"). Default is "rook".
+        :type adjacency: str, optional
+        :param cols_to_add: The names of the columns that you want to
+            add to the graph as node attributes. Default is None.
+        :type cols_to_add: Optional[List[str]], optional
+        :param reproject: Whether to reproject to a UTM projection before
+            creating the graph. Default is False.
+        :type reproject: bool, optional
+        :param ignore_errors: Whether to ignore all invalid geometries and try to continue
+            creating the graph. Default is False.
+        :type ignore_errors: bool, optional
+
+        :returns: The Graph object of the geometries from `filename`.
+        :rtype: Graph
+
+        .. Warning::
+
+            This method requires the optional ``geopandas`` dependency.
+            So please install ``gerrychain`` with the ``geo`` extra
+            via the command:
+
+            .. code-block:: console
+
+                pip install gerrychain[geo]
+
+            or install ``geopandas`` separately.
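+
+        Example (illustrative; the filename and column name are placeholders)::
+
+            graph = Graph.from_file("precincts.shp", cols_to_add=["TOTPOP"])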
+ """ + + df = gp.read_file(filename) + graph = cls.from_geodataframe( + df, + adjacency=adjacency, + cols_to_add=cols_to_add, + reproject=reproject, + ignore_errors=ignore_errors, + ) + # frm: TODO: Need to make sure this works for RX also + # To do so, need to find out how CRS data is used + # and whether it is used externally or only internally... + # + # Note that the NetworkX.Graph.graph["crs"] is only + # ever accessed in this file (graph.py), so I am not + # clear what it is used for. It seems to just be set + # and never used except to be written back out to JSON. + # + # The issue (I think) is that we do not preserve graph + # attributes when we convert to RX from NX, so if the + # user wants to write an RX based Graph back out to JSON + # this data (and another other graph level data) would be + # lost. + # + # So - need to figure out what CRS is used for... + # + # Peter commented on this in a PR comment: + # + # CRS stands for "Coordinate Reference System" which can be thought of + # as the projection system used for the polygons contained in the + # geodataframe. While it is not used in any of the graph operations of + # GerryChain, it may be used in things like validators and updaters. Since + # the CRS determines the projection system used by the underlying + # geodataframe, any area or perimeter computations encoded on the graph + # are stored with the understanding that those values may inherit + # distortions from projection used. We keep this around as metadata so + # that, in the event that the original geodataframe source is lost, + # the graph metadata still carries enough information for us to sanity + # check the area and perimeter computations if we get weird numbers. + + + # Store CRS data as an attribute of the NX graph + graph._nx_graph.graph["crs"] = df.crs.to_json() + return graph + + @classmethod + def from_geodataframe( + cls, + dataframe: pd.DataFrame, + adjacency: str = "rook", + cols_to_add: Optional[List[str]] = None, + reproject: bool = False, + ignore_errors: bool = False, + crs_override: Optional[Union[str, int]] = None, + ) -> "Graph": + + # frm: Changed to operate on a NetworkX.Graph object and then convert to a + # Graph object at the end of the function. + + """ + Creates the adjacency :class:`Graph` of geometries described by `dataframe`. + The areas of the polygons are included as node attributes (with key `area`). + The shared perimeter of neighboring polygons are included as edge attributes + (with key `shared_perim`). + Nodes corresponding to polygons on the boundary of the union of all the geometries + (e.g., the state, if your dataframe describes VTDs) have a `boundary_node` attribute + (set to `True`) and a `boundary_perim` attribute with the length of this "exterior" + boundary. + + By default, areas and lengths are computed in a UTM projection suitable for the + geometries. This prevents the bizarro area and perimeter values that show up when + you accidentally do computations in Longitude-Latitude coordinates. If the user + specifies `reproject=False`, then the areas and lengths will be computed in the + GeoDataFrame's current coordinate reference system. This option is for users who + have a preferred CRS they would like to use. + + :param dataframe: The GeoDateFrame to convert + :type dataframe: :class:`geopandas.GeoDataFrame` + :param adjacency: The adjacency type to use ("rook" or "queen"). + Default is "rook". 
+        :type adjacency: str, optional
+        :param cols_to_add: The names of the columns that you want to
+            add to the graph as node attributes. Default is None.
+        :type cols_to_add: Optional[List[str]], optional
+        :param reproject: Whether to reproject to a UTM projection before
+            creating the graph. Default is ``False``.
+        :type reproject: bool, optional
+        :param ignore_errors: Whether to ignore all invalid geometries and
+            attempt to create the graph anyway. Default is ``False``.
+        :type ignore_errors: bool, optional
+        :param crs_override: Value to override the CRS of the GeoDataFrame.
+            Default is None.
+        :type crs_override: Optional[Union[str, int]], optional
+
+        :returns: The adjacency graph of the geometries from `dataframe`.
+        :rtype: Graph
+        """
+        # Validate geometries before reprojection
+        if not ignore_errors:
+            invalid = invalid_geometries(dataframe)
+            if len(invalid) > 0:
+                raise GeometryError(
+                    "Invalid geometries at rows {} before "
+                    "reprojection. Consider repairing the affected geometries with "
+                    "`.buffer(0)`, or pass `ignore_errors=True` to attempt to create "
+                    "the graph anyways.".format(invalid)
+                )
+
+        # Project the dataframe to an appropriate UTM projection unless
+        # explicitly told not to.
+        if reproject:
+            df = reprojected(dataframe)
+            if not ignore_errors:
+                invalid_reproj = invalid_geometries(df)
+                if len(invalid_reproj) > 0:
+                    raise GeometryError(
+                        "Invalid geometries at rows {} after "
+                        "reprojection. Consider reloading the GeoDataFrame with "
+                        "`reproject=False` or repairing the affected geometries "
+                        "with `.buffer(0)`.".format(invalid_reproj)
+                    )
+        else:
+            df = dataframe
+
+        # Generate dict of dicts of dicts with shared perimeters according
+        # to the requested adjacency rule
+        adjacencies = neighbors(df, adjacency)  # Note - this is adjacency.neighbors()
+
+        # frm: Original Code: graph = cls(adjacencies)
+        nx_graph = networkx.Graph(adjacencies)
+
+        # frm: TODO: Need to grok what geometry is used for - it is used in partition.py.plot(),
+        #            and maybe that is the only place it is used, but it is also used below
+        #            to set other data, such as add_boundary_perimeters() and areas.  The
+        #            reason this is an issue is because I need to know what to carry over to
+        #            the RX version of a Graph when I convert to RX when making a Partition.
+        #            Partition.plot() uses this information, so it needs to be available in
+        #            the RX version of a Graph - which essentially means that I need to grok
+        #            how plot() works, where it gets its information, and how existing
+        #            users use it...
+        #
+        #            There is a test failure due to geometry not being available after
+        #            conversion to RX.
+        #
+        #            Here is what Peter said in the PR:
+        #
+        #            The geometry attribute on df is a special attribute that only appears on
+        #            geodataframes.  This is just a list of polygons representing some real-life
+        #            geometries underneath a certain projection system (CRS).  These polygons can
+        #            then be fed to matplotlib to make nice plots of things, or they can be used
+        #            to compute things like area and perimeter for use in updaters and validators
+        #            that employ some sort of Reock score (uncommon, but unfortunately necessary in
+        #            some jurisdictions).  We probably don't need to store this as an attribute on
+        #            the Graph._nxgraph object (or the Graph._rxgraph object), however.
+            In fact, it might be best to just make a Graph.dataframe attribute to
+            store all of the graph data on, and add attributes to _nxgraph and
+            _rxgraph nodes as needed.
+
+        nx_graph.geometry = df.geometry
+
+        # frm: TODO: Rethink the name of add_boundary_perimeters - it acts on an nx_graph,
+        #            which seems wrong with the given name.  Maybe it should be:
+        #            add_boundary_perimeters_to_nx_graph()
+        #
+        #            It raises the question of whether there should be an nx_utilities
+        #            module for stuff designed to only work on nx_graph objects.
+        #
+        #            Note that Peter said: "I like this idea"
+
+        # Add "exterior" perimeters to the boundary nodes
+        add_boundary_perimeters(nx_graph, df.geometry)
+
+        # Add area data to the nodes
+        areas = df.geometry.area.to_dict()
+        networkx.set_node_attributes(nx_graph, name="area", values=areas)
+
+        if crs_override is not None:
+            df.set_crs(crs_override, inplace=True)
+
+        if df.crs is None:
+            warnings.warn(
+                "GeoDataFrame has no CRS. Did you forget to set it? "
+                "If you're sure this is correct, you can ignore this warning. "
+                "Otherwise, please set the CRS using the `crs_override` parameter. "
+                "Attempting to proceed without a CRS."
+            )
+            nx_graph.graph["crs"] = None
+        else:
+            nx_graph.graph["crs"] = df.crs.to_json()
+
+        graph = cls.from_networkx(nx_graph)
+
+        # frm: Moved from earlier in the function so that we would have a Graph
+        #      object (vs. a NetworkX.Graph object)
+
+        graph.add_data(df, columns=cols_to_add)
+        graph.issue_warnings()
+
+        return graph
+
+    def lookup(self, node: Any, field: Any):
+        # Not quite sure why this routine existed in the original graph.py
+        # code, since most of the other code does not use it, and instead
+        # does graph.nodes[node_id][key] - back when a Graph was a subclass
+        # of NetworkX.Graph.
+        #
+        # It is left because a couple of other files use it (versioneer.py,
+        # county_splits.py, and tally.py) and because perhaps an end user also
+        # uses it.  Leaving it does no significant harm - it is just code bloat...
+
+        # frm: TODO: Remove this routine: def lookup() => in FrozenGraph too
+        #
+        #            As per Peter's PR comment:
+        #
+        #            Yeah, I will get rid of this in the future.  This is very old code
+        #            that someone probably wrote to make their life easier in the early
+        #            stages of the package, but it's not really useful.  I am going to be
+        #            changing all of the old setup and versioning systems over to use UV
+        #            anyway, and county_splits.py and tally.py are easy changes.
+
+        return self.node_data(node)[field]
+
+    @property
+    def node_indices(self):
+        self.verify_graph_is_valid()
+
+        # frm: TODO: This does much the same thing that graph.nodes does - returning the
+        #            node_ids.  Do we really want to support two ways of doing the same
+        #            thing?
+
+        if self.is_nx_graph():
+            return set(self._nx_graph.nodes)
+        elif self.is_rx_graph():
+            return set(self._rx_graph.node_indices())
+        else:
+            raise Exception("Graph.node_indices - bad kind of graph object")
+
+    @property
+    def edge_indices(self):
+        self.verify_graph_is_valid()
+
+        if self.is_nx_graph():
+            # A set of edge_ids (tuples) extracted from the graph's EdgeView
+            return set(self._nx_graph.edges)
+        elif self.is_rx_graph():
+            # A set of edge_ids for the edges
+            return set(self._rx_graph.edge_indices())
+        else:
+            raise Exception("Graph.edge_indices - bad kind of graph object")
+
+    def get_edge_from_edge_id(self, edge_id):
+        """
+        In NX, an edge_id is a tuple of node_ids, but in RX an edge_id
+        is an integer.  To get the tuple of node_ids in RX, you need to
+        make a call using the edge_id.
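+
+        For example (illustrative): the same underlying edge might have
+        edge_id ``(3, 5)`` in an NX based graph but an integer edge_id such
+        as ``7`` in an RX based graph, where ``get_edge_from_edge_id(7)``
+        returns ``(3, 5)``.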
+
+        Stated differently, in NX an edge and an edge ID are the same, but
+        not in RX...
+        """
+        self.verify_graph_is_valid()
+
+        if self.is_nx_graph():
+            # In NX, the edge_id is also the edge tuple
+            return edge_id
+        elif self.is_rx_graph():
+            # In RX, we need to go get the edge tuple
+            return self._rx_graph.edge_list()[edge_id]
+        else:
+            raise Exception("Graph.get_edge_from_edge_id - bad kind of graph object")
+
+    def get_edge_id_from_edge(self, edge):
+        """
+        Another case where we need to deal with the fact that in
+        NX an edge ID is a tuple of node_ids, whereas in RX an edge ID
+        is an integer associated with an edge.
+        """
+        self.verify_graph_is_valid()
+
+        if self.is_nx_graph():
+            # In NX, the edge_id is also the edge tuple
+            return edge
+        elif self.is_rx_graph():
+            # In RX, we need to go get the edge_id from the edge tuple
+            # frm: TODO: Think about whether there is a better way to do this.  I am
+            #            worried that this might be expensive in terms of performance
+            #            with large graphs.  This is used in tree.py when seeing if a
+            #            cut edge has a weight assigned to it.
+            # frm: Note that we check both orientations of the edge, so we don't have
+            #      to worry about whether the edge was (3,5) or (5,3).  Note also that
+            #      we must not sort the edge list: an edge's id is its position in
+            #      edge_list(), and sorting would scramble that correspondence.
+            for candidate_edge_id, (node_a, node_b) in enumerate(self._rx_graph.edge_list()):
+                if (node_a, node_b) == edge or (node_b, node_a) == edge:
+                    return candidate_edge_id
+            raise ValueError("Graph.get_edge_id_from_edge - edge {} not found".format(edge))
+        else:
+            raise Exception("Graph.get_edge_id_from_edge - bad kind of graph object")
+
+    @property
+    def nodes(self):
+        self.verify_graph_is_valid()
+
+        if self.is_nx_graph():
+            # A list of node_ids
+            return list(self._nx_graph.nodes)
+        elif self.is_rx_graph():
+            # A list of integer node_ids
+            return list(self._rx_graph.node_indices())
+        else:
+            raise Exception("Graph.nodes - bad kind of graph object")
+
+    @property
+    def edges(self):
+        # frm: TODO: Confirm that this will work - returning different kinds of values
+
+        """
+        Edges are one of the areas where NX and RX differ.
+
+        Conceptually an edge is just a tuple identifying the two nodes comprising the edge.
+        To be a little more specific, we will consider an edge to be a tuple of node_ids.
+
+        But what is an edge_id?  In NX, the edge_id is just the tuple of node_ids.  I do
+        not know if NX is smart enough in an undirected graph to know that (3,4) is the same
+        as (4,3), but I assume that it is.  In RX, however, the edge_id is just an integer.
+        Stated differently, in NX there is no difference between an "edge" and an "edge_id",
+        but in RX there is.
+
+        So, the new Graph object is going to distinguish between edges and edge_ids.
+        Graph.edges will return a set of tuples in both cases, and Graph.edge_indices will
+        return a set of edge_ids in both cases.  This is a little funky, as the return type
+        for Graph.edge_indices will be structurally different for the NX and RX versions of
+        Graph objects, but hey - this is Python, so why not?  Sorry for the snide attack...
+
+        Another issue (that should probably be documented elsewhere instead of here) is that
+        in NX, Graph.edges returns an EdgeView object which allows for access to several
+        different bits of information about edges.  If you iterate over Graph.edges you
+        get a sequence of tuples for the edges, but if you use square bracket notation,
+        as in Graph.edges[(n1, n2)], you get access to the data dictionary for the edge.
+
+        Here are some examples:
+
+            for e in nx_graph.edges:
+                print("This edge goes between nodes: ", e)
+
+        The above will print out all of the edge_id tuples:
+
+            This edge goes between nodes:  (46, 47)
+            This edge goes between nodes:  (47, 55)
+            This edge goes between nodes:  (48, 56)
+            This edge goes between nodes:  (48, 49)
+            ...
+
+        However, if you want to get the data dictionary associated with the edge that goes
+        between nodes 46 and 47, then you can do:
+
+            print("node: (46,47) has data: ", nx_graph.edges[(46,47)])
+
+            node: (46,47) has data:  {'weight': 5.5, 'total_population': 123445}
+
+        RX does not support the EdgeView object, so we will use the same approach as for nodes.
+        To get access to an edge's data dictionary, one will need to use the new function,
+        edge_data(edge_id) - where edge_id will be either a tuple or an integer depending
+        on what flavor of Graph is being operated on.
+        """
+
+        self.verify_graph_is_valid()
+
+        if self.is_nx_graph():
+            # A set of tuples extracted from the graph's EdgeView
+            return set(self._nx_graph.edges)
+        elif self.is_rx_graph():
+            # A set of tuples for the edges
+            return set(self._rx_graph.edge_list())
+        else:
+            raise Exception("Graph.edges - bad kind of graph object")
+
+    def add_edge(self, node_id1, node_id2):
+        self.verify_graph_is_valid()
+
+        if self.is_nx_graph():
+            self._nx_graph.add_edge(node_id1, node_id2)
+        elif self.is_rx_graph():
+            # empty dict tells RX the edge data will be a dict
+            self._rx_graph.add_edge(node_id1, node_id2, {})
+        else:
+            raise Exception("Graph.add_edge - bad kind of graph object")
+
+    def get_edge_tuple(self, edge_id):
+        self.verify_graph_is_valid()
+
+        if self.is_nx_graph():
+            # In NX, the edge_id is already a tuple with the two node_ids
+            return edge_id
+        elif self.is_rx_graph():
+            return self._rx_graph.edge_list()[edge_id]
+        else:
+            raise Exception("Graph.get_edge_tuple - bad kind of graph object")
+
+    def add_data(
+        self, df: pd.DataFrame, columns: Optional[Iterable[str]] = None
+    ) -> None:
+        """
+        Add columns of a DataFrame to a graph as node attributes
+        by matching the DataFrame's index to node ids.
+
+        :param df: Dataframe containing the given columns.
+        :type df: :class:`pandas.DataFrame`
+        :param columns: List of dataframe column names to add. Default is None.
+        :type columns: Optional[Iterable[str]], optional
+
+        :returns: None
+        """
+
+        if not self.is_nx_graph():
+            raise Exception("Graph.add_data only valid for NetworkX based graphs")
+
+        if columns is None:
+            columns = list(df.columns)
+
+        check_dataframe(df[columns])
+
+        # Create dict: {node_id: {attr_name: attr_value}}
+        column_dictionaries = df.to_dict("index")
+        nx_graph = self._nx_graph
+        networkx.set_node_attributes(nx_graph, column_dictionaries)
+
+        if hasattr(nx_graph, "data"):
+            nx_graph.data[columns] = df[columns]  # type: ignore
+        else:
+            nx_graph.data = df[columns]
+
+    def join(
+        self,
+        dataframe: pd.DataFrame,
+        columns: Optional[List[str]] = None,
+        left_index: Optional[str] = None,
+        right_index: Optional[str] = None,
+    ) -> None:
+        """
+        Add data from a dataframe to the graph, matching nodes to rows when
+        the node's `left_index` attribute equals the row's `right_index` value.
+
+        :param dataframe: DataFrame.
+        :type dataframe: :class:`pandas.DataFrame`
+        :param columns: The columns whose data you wish to add to the graph.
+            If not provided, all columns are added. Default is None.
+        :type columns: Optional[List[str]], optional
+        :param left_index: The node attribute used to match nodes to rows.
+            If not provided, node IDs are used. Default is None.
+        :type left_index: Optional[str], optional
+        :param right_index: The DataFrame column name to use to match rows
+            to nodes. If not provided, the DataFrame's index is used. Default is None.
+        :type right_index: Optional[str], optional
+
+        :returns: None
+        """
+        if right_index is not None:
+            df = dataframe.set_index(right_index)
+        else:
+            df = dataframe
+
+        if columns is not None:
+            df = df[columns]
+
+        check_dataframe(df)
+
+        column_dictionaries = df.to_dict()
+
+        if not self.is_nx_graph():
+            raise Exception("Graph.join only valid for NetworkX based Graph objects")
+        nx_graph = self._nx_graph
+
+        if left_index is not None:
+            ids_to_index = networkx.get_node_attributes(nx_graph, left_index)
+        else:
+            # When the left_index is the node ID, the matching is just
+            # a redundant {node: node} dictionary
+            # frm: TODO: don't think self.nodes works for RX...
+            ids_to_index = dict(zip(self.nodes, self.nodes))
+
+        node_attributes = {
+            node_id: {
+                column: values[index] for column, values in column_dictionaries.items()
+            }
+            for node_id, index in ids_to_index.items()
+        }
+
+        networkx.set_node_attributes(nx_graph, node_attributes)
+
+    @property
+    def islands(self):
+        # Return all nodes of degree 0 (those not connected by an edge to another node)
+        return set(node for node in self.node_indices if self.degree(node) == 0)
+
+    def is_directed(self):
+        # frm TODO: Get rid of this hack.  I added it because code in contiguity.py
+        #           called nx.is_connected(), which eventually called is_directed()
+        #           assuming the graph was an nx_graph.
+        return False
+
+    def warn_for_islands(self) -> None:
+        islands = self.islands
+        if len(self.islands) > 0:
+            warnings.warn(
+                "Found islands (degree-0 nodes). Indices of islands: {}".format(islands)
+            )
+
+    def issue_warnings(self) -> None:
+        self.warn_for_islands()
+
+    # frm TODO: Implement a FrozenGraph that supports RX...
+    #     self.graph.join = frozen
+    #     self.graph.add_data = frozen
+    #     self.size = len(self.graph)
+
+    def __len__(self) -> int:
+        # Relies on self.node_indices to work on both NX and RX
+        return len(self.node_indices)
+
+    def __getattr__(self, __name: str) -> Any:
+        # frm: TODO: Get rid of this eventually - it is very dangerous...
+
+        # frm: Interesting bug lurking if __name is "nx_graph".  This occurs when legacy code
+        #      uses the default constructor, Graph(), and then references a built-in NX
+        #      Graph method, such as my_graph.add_edges().  In this case the built-in NX
+        #      Graph method is not defined, so __getattr__() is called to try to figure out
+        #      what it could be.  This triggers the call below to self.is_nx_graph(), which
+        #      references self._nx_graph (which is undefined/None), which triggers another
+        #      call to __getattr__(), which is BAD...
+        #
+        #      I think the solution is to not rely on testing whether nx_graph and rx_graph
+        #      are None - but rather to have an explicit is_nx_or_rx_graph data member that
+        #      is set to one of "NX", "RX", "not_set".
+        #
+        #      For now, I am just going to return None if __name is "_nx_graph" or "_rx_graph".
+        #
+        #      Peter's comments from the PR:
+        #
+        #      Oh interesting; good catch!  The flag approach seems like a good solution to me.
+        #      It's very, very rare to use the default constructor, so I don't imagine that
+        #      people will really run into this.
+
+        # frm: TODO: Fix this hack - see comment above...
+        if (__name == "_nx_graph") or (__name == "_rx_graph"):
+            return None
+
+        # If the attribute doesn't exist on this object, try
+        # its underlying graph object...
+        if self.is_nx_graph():
+            return object.__getattribute__(self._nx_graph, __name)
+        elif self.is_rx_graph():
+            return object.__getattribute__(self._rx_graph, __name)
+        else:
+            raise Exception("Graph.__getattr__ - bad kind of graph object")
+
+    def __getitem__(self, __name: str) -> Any:
+        # frm: ???: TODO: Does any of the code actually use this?
+        #      It is a clever Python way to use square bracket
+        #      notation to access something (anything) you want.
+        #
+        #      In this case, it returns the NetworkX AtlasView
+        #      of neighboring nodes - looks like a dictionary
+        #      with a key of the neighbor node_id and a value
+        #      with the neighboring node's data (another dict).
+        #
+        #      I am guessing that it is only ever used to get
+        #      a list of the neighbor node_ids, in which case
+        #      it is functionally equivalent to self.neighbors().
+        #
+        #      *sigh*
+
+        self.verify_graph_is_valid()
+
+        if self.is_nx_graph():
+            return self._nx_graph[__name]
+        elif self.is_rx_graph():
+            # frm TODO:
+            raise Exception("Graph.__getitem__() NYI for RX")
+        else:
+            raise Exception("Graph.__getitem__() - bad kind of graph object")
+
+    def __iter__(self) -> Iterable[Any]:
+        # frm: TODO: Verify that this does the right thing...
+        #            It seems to do the right thing - iterating over node_ids, which
+        #            works so long as NX uses integers for node_ids.
+        # frm: TODO: Perhaps I should test for non-integer node_ids in NX graphs and issue
+        #            a warning...  In any event, this deserves thought: what to do for NX
+        #            graphs that do not use integers for node_ids?
+        yield from self.node_indices
+
+    def subgraph(self, nodes: Iterable[Any]) -> "Graph":
+        """
+        frm: RX Documentation:
+
+        Subgraphs are one of the biggest differences between NX and RX, because RX creates new
+        node_ids for the nodes in the subgraph, starting at 0.  So, if you create a subgraph
+        with a list of nodes: [45, 46, 47], the nodes in the subgraph will be [0, 1, 2].
+
+        This creates problems for functions that operate on subgraphs and want to return
+        results involving node_ids to the caller.  To solve this, we define a
+        _node_id_to_parent_node_id_map whenever we create a subgraph that will provide the
+        node_id in the parent for each node in the subgraph.  For NX this is a no-op, and the
+        _node_id_to_parent_node_id_map is just an identity map - each node_id is mapped to
+        itself.  For RX, however, we store the parent_node_id in the node's data before
+        creating the subgraph, and then in the subgraph, we use the parent's node_id to
+        construct a map from the subgraph node_id to the parent_node_id.
+
+        This means that any function that wants to return results involving node_ids can
+        safely just translate node_ids using the _node_id_to_parent_node_id_map, so that the
+        results make sense in the caller's context.
+
+        A note of caution: if the caller retains the subgraph after using it in a function
+        call, the caller should almost certainly not use the node_ids in the subgraph for
+        ANYTHING.  It would be safest to reset the value of the subgraph to None after using
+        it as an argument to a function call.
+
+        Also, for both RX and NX, we set the _node_id_to_parent_node_id_map to be the
+        identity map for top-level graphs on the off chance that there is a function that
+        takes both top-level graphs and subgraphs as a parameter.  This allows the function
+        to just always do the node translation.  In the case of a top-level graph the
+        translation will be a no-op, but it will be correct.
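+
+        A small illustration (RX based graph; the node_ids are hypothetical):
+
+            sub = graph.subgraph([45, 46, 47])
+            list(sub.node_indices)                   # [0, 1, 2]
+            sub._node_id_to_parent_node_id_map[0]    # 45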
+
+        Also, we set _is_a_subgraph = True, so that we can detect whether a parameter passed
+        into a function is a top-level graph or not.  This will allow us to debug the code to
+        determine if assumptions about a parameter always being a subgraph are accurate.  It
+        also helps to educate future readers of the code that subgraphs are "interesting"...
+        """
+
+        self.verify_graph_is_valid()
+
+        new_subgraph = None
+
+        if self.is_nx_graph():
+            nx_subgraph = self._nx_graph.subgraph(nodes)
+            new_subgraph = self.from_networkx(nx_subgraph)
+            # for NX, the node_ids in the subgraph are the same as in the parent graph
+            _node_id_to_parent_node_id_map = {node: node for node in nodes}
+            _node_id_to_original_node_id_map = {node: node for node in nodes}
+        elif self.is_rx_graph():
+            # frm TODO: Need to check the logic below - not sure this works exactly
+            #           correctly for RX...
+            if isinstance(nodes, frozenset) or isinstance(nodes, set):
+                nodes = list(nodes)
+            # For RX, the node_ids in the subgraph change, so we need a way to map subgraph
+            # node_ids into parent graph node_ids.  To do so, we add the parent node_id into
+            # the node data so that in the subgraph we can find it and then create the map.
+            # frm: TODO: Be careful - node data is shared by subgraphs, so a subgraph of this
+            #            subgraph will still have this field set - meaning that the field's
+            #            value is not dependable over time - perhaps I should null it out
+            #            after using it here...
+            for node_id in nodes:
+                self.node_data(node_id)["parent_node_id"] = node_id
+
+            # frm: TODO: Since data is shared by nodes in subgraphs, perhaps we could just set
+            #            the "original_node_id" in the beginning and rely on it forever...
+            for node_id in nodes:
+                self.node_data(node_id)["original_node_id"] = self._node_id_to_original_node_id_map[node_id]
+
+            rx_subgraph = self._rx_graph.subgraph(nodes)
+            new_subgraph = self.from_rustworkx(rx_subgraph)
+
+            # frm: Create the map from subgraph node_id to parent graph node_id
+            _node_id_to_parent_node_id_map = {}
+            for subgraph_node_id in new_subgraph.node_indices:
+                _node_id_to_parent_node_id_map[subgraph_node_id] = new_subgraph.node_data(subgraph_node_id)["parent_node_id"]
+            # frm: Create the map from subgraph node_id to the original graph's node_id
+            _node_id_to_original_node_id_map = {}
+            for subgraph_node_id in new_subgraph.node_indices:
+                _node_id_to_original_node_id_map[subgraph_node_id] = new_subgraph.node_data(subgraph_node_id)["original_node_id"]
+
+        else:
+            raise Exception("Graph.subgraph - bad kind of graph object")
+
+        new_subgraph._is_a_subgraph = True
+        new_subgraph._node_id_to_parent_node_id_map = _node_id_to_parent_node_id_map
+        new_subgraph._node_id_to_original_node_id_map = _node_id_to_original_node_id_map
+
+        return new_subgraph
+
+    def translate_subgraph_node_ids_for_flips(self, flips):
+        # flips is a dictionary mapping node_ids to parts (districts).
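+        # e.g. (illustrative): subgraph flips {0: "A", 2: "B"} might translate
+        # to {45: "A", 47: "B"} in the parent graph's node_ids.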
+        translated_flips = {}
+        for subgraph_node_id, part in flips.items():
+            parent_node_id = self._node_id_to_parent_node_id_map[subgraph_node_id]
+            translated_flips[parent_node_id] = part
+
+        return translated_flips
+
+    def translate_subgraph_node_ids_for_set_of_nodes(self, set_of_nodes):
+        translated_set_of_nodes = set()
+        for node_id in set_of_nodes:
+            translated_set_of_nodes.add(self._node_id_to_parent_node_id_map[node_id])
+        return translated_set_of_nodes
+
+    def nx_generic_bfs_edges(self, source, neighbors=None, depth_limit=None):
+        # frm: Code copied from GitHub:
+        #
+        #      https://github.com/networkx/networkx/blob/main/networkx/algorithms/traversal/breadth_first_search.py
+        #
+        #      The code was not modified (apart from the added frm comments) - it worked as
+        #      written for both an rx.PyGraph and a graph.Graph object with an RX graph
+        #      embedded in it...
+
+        """Iterate over edges in a breadth-first search.
+
+        The breadth-first search begins at `source` and enqueues the
+        neighbors of newly visited nodes specified by the `neighbors`
+        function.
+
+        Parameters
+        ----------
+        G : RustworkX.PyGraph object (not a NetworkX graph)
+
+        source : node
+            Starting node for the breadth-first search; this function
+            iterates over only those edges in the component reachable from
+            this node.
+
+        neighbors : function
+            A function that takes a newly visited node of the graph as input
+            and returns an *iterator* (not just a list) of nodes that are
+            neighbors of that node with custom ordering.  If not specified, this is
+            just the ``G.neighbors`` method, but in general it can be any function
+            that returns an iterator over some or all of the neighbors of a
+            given node, in any order.
+
+        depth_limit : int, optional(default=len(G))
+            Specify the maximum search depth.
+
+        Yields
+        ------
+        edge
+            Edges in the breadth-first search starting from `source`.
+
+        Examples
+        --------
+        >>> G = nx.path_graph(7)
+        >>> list(nx.generic_bfs_edges(G, source=0))
+        [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6)]
+        >>> list(nx.generic_bfs_edges(G, source=2))
+        [(2, 1), (2, 3), (1, 0), (3, 4), (4, 5), (5, 6)]
+        >>> list(nx.generic_bfs_edges(G, source=2, depth_limit=2))
+        [(2, 1), (2, 3), (1, 0), (3, 4)]
+
+        The `neighbors` param can be used to specify the visitation order of each
+        node's neighbors generically.  In the following example, we modify the default
+        neighbor to return *odd* nodes first:
+
+        >>> def odd_first(n):
+        ...     return sorted(G.neighbors(n), key=lambda x: x % 2, reverse=True)
+
+        >>> G = nx.star_graph(5)
+        >>> list(nx.generic_bfs_edges(G, source=0))  # Default neighbor ordering
+        [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5)]
+        >>> list(nx.generic_bfs_edges(G, source=0, neighbors=odd_first))
+        [(0, 1), (0, 3), (0, 5), (0, 2), (0, 4)]
+
+        Notes
+        -----
+        This implementation is from `PADS`_, which was in the public domain
+        when it was first accessed in July, 2004.  The modifications
+        to allow depth limits are based on the Wikipedia article
+        "`Depth-limited-search`_".
+
+        .. _PADS: http://www.ics.uci.edu/~eppstein/PADS/BFS.py
_Depth-limited-search: https://en.wikipedia.org/wiki/Depth-limited_search + """ + # frm: These two if-stmts work for both rx.PyGraph and gerrychain.Graph with RX inside + if neighbors is None: + neighbors = self.neighbors + if depth_limit is None: + depth_limit = len(self) + + seen = {source} + n = len(self) + depth = 0 + next_parents_children = [(source, neighbors(source))] + while next_parents_children and depth < depth_limit: + this_parents_children = next_parents_children + next_parents_children = [] + for parent, children in this_parents_children: + for child in children: + # frm: avoid cycles - don't process a child twice... + if child not in seen: + seen.add(child) + # frm: add this node's children to list to be processed later... + next_parents_children.append((child, neighbors(child))) + yield parent, child + if len(seen) == n: + return + depth += 1 + + # frm: TODO: Add tests for all of the new routines I have added... + + def generic_bfs_successors_generator(self, root_node_id): + # frm: Generate in sequence a tuple for the parent (node_id) and + # the children of that node (list of node_ids). + parent = root_node_id + children = [] + for p, c in self.nx_generic_bfs_edges(root_node_id): + # frm: parent-child pairs appear ordered by their parent, so + # we can collect all of the children for a node by just + # iterating through pairs until the parent changes. + if p == parent: + children.append(c) + continue + yield (parent, children) + # new parent, so reset parent and children variables to + # be the new parent (p) and a new children list containing + # this first child (c), and continue looping + children = [c] + parent = p + yield (parent, children) + + def generic_bfs_successors(self, root_node_id): + return dict(self.generic_bfs_successors_generator(root_node_id)) + + def generic_bfs_predecessors(self, root_node_id): + # frm Note: We had do implement our own, because the built-in RX version only worked + # for directed graphs. + predecessors = [] + for s, t in self.nx_generic_bfs_edges(root_node_id): + predecessors.append((t,s)) + return dict(predecessors) + + + def predecessors(self, root_node_id): + + """ + frm: It took me a while to grok what predecessors() and successors() + were all about. In the end, it was simple - they are just the + parents and the children of a tree that "starts" at the given root + node. + + What took me a while to understand is that this effectively + converts an undirected cyclic graph into a DAG. What is clever is + that as soon as it detects a cycle it stops traversing the graph. + The other thing that is clever is that the DAG that is created + either starts at the top or the bottom. For successors(), the + DAG starts at the top, so that the argument to successors() is + the root of the tree. However, in the case of predecessors() + the argument to predecessors() is a leaf node, and the "tree" + can have multiple "roots". + + In both cases, you can ask what the associated parent or + children are of any node in the graph. If you ask for the + successors() you will get a list of the children nodes. + If you ask for the predecessors() you will get the single + parent node. + + I think that the successors() graph is deterministic (except + for the order of the child nodes), meaning that for a given + graph no matter what order you created nodes and added edges, + you will get the same set of children for a given node. 
+        However, for predecessors(), there are many different
+        DAGs that might be created depending on which edge the
+        algorithm decides is the single parent.
+
+        All of this is interesting, but I have not yet spent the
+        time to figure out why it matters in the code.
+
+        TODO: The code in NetworkX for bfs_successors() and bfs_predecessors()
+              works on undirected graphs (with cleverness to cut cycles), but
+              the same named routines in RX only operate on directed graphs,
+              so there is work to be done to make this functionality work
+              for RX...
+        """
+
+        self.verify_graph_is_valid()
+
+        if (self.is_nx_graph()):
+            return {a: b for a, b in networkx.bfs_predecessors(self._nx_graph, root_node_id)}
+        elif (self.is_rx_graph()):
+            return self.generic_bfs_predecessors(root_node_id)
+        else:
+            raise Exception("Graph.predecessors - bad kind of graph object")
+
+    def successors(self, root_node_id):
+        self.verify_graph_is_valid()
+
+        if (self.is_nx_graph()):
+            return {a: b for a, b in networkx.bfs_successors(self._nx_graph, root_node_id)}
+        elif (self.is_rx_graph()):
+            return self.generic_bfs_successors(root_node_id)
+        else:
+            raise Exception("Graph.successors - bad kind of graph object")
+
+    def neighbors(self, node):
+        self.verify_graph_is_valid()
+
+        # NX neighbors() returns an iterator over the node_ids of neighbor nodes.
+        # RX neighbors() returns a NodeIndices object with the list of node_ids of neighbor nodes.
+        # However, the code outside graph.py only ever iterates over all neighbors, so returning a list works...
+        if (self.is_nx_graph()):
+            return list(self._nx_graph.neighbors(node))
+        elif (self.is_rx_graph()):
+            return list(self._rx_graph.neighbors(node))
+        else:
+            raise Exception("Graph.neighbors - bad kind of graph object")
+
+    def degree(self, node: Any) -> int:
+        self.verify_graph_is_valid()
+
+        if (self.is_nx_graph()):
+            return self._nx_graph.degree(node)
+        elif (self.is_rx_graph()):
+            return self._rx_graph.degree(node)
+        else:
+            raise Exception("Graph.degree - bad kind of graph object")
+
+    def node_data(self, node_id):
+        # This routine returns the data dictionary for the given node's data
+
+        self.verify_graph_is_valid()
+
+        if (self.is_nx_graph()):
+            data_dict = self._nx_graph.nodes[node_id]
+        elif (self.is_rx_graph()):
+            data_dict = self._rx_graph[node_id]
+        else:
+            raise Exception("Graph.node_data - bad kind of graph object")
+
+        if not isinstance(data_dict, dict):
+            raise Exception("node data is not a dictionary")
+
+        return data_dict
+
+    def edge_data(self, edge_id):
+        # This routine returns the data dictionary for the given edge's data
+
+        """
+        CLEVERNESS ALERT!
+
+        The type of the edge_id parameter will be a tuple in the case of an
+        embedded NX graph but will be an integer in the case of an RX embedded
+        graph.
+        """
+
+        self.verify_graph_is_valid()
+
+        if (self.is_nx_graph()):
+            data_dict = self._nx_graph.edges[edge_id]
+        elif (self.is_rx_graph()):
+            data_dict = self._rx_graph.edges()[edge_id]
+        else:
+            raise Exception("Graph.edge_data - bad kind of graph object")
+
+        if not isinstance(data_dict, dict):
+            raise Exception("edge data is not a dictionary")
+
+        return data_dict
+
+    # frm: Note: I added the laplacian_matrix routines as methods of the Graph
+    #            class because they are only ever used on Graph objects.  It
+    #            bloats the Graph class, but it still seems like the best
+    #            option.
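+    # frm: For reference, the (combinatorial) graph Laplacian is L = D - A,
+    #      where D is the diagonal degree matrix and A is the adjacency
+    #      matrix.  A tiny illustrative example (path graph 0-1-2):
+    #
+    #          D = [[1,0,0],    A = [[0,1,0],    L = [[ 1,-1, 0],
+    #               [0,2,0],         [1,0,1],         [-1, 2,-1],
+    #               [0,0,1]]         [0,1,0]]         [ 0,-1, 1]]
+    #
+    #      This is exactly what the RX branch below computes by hand.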
+
+    def laplacian_matrix(self):
+        # A local "gc" (as in GerryChain) version of the laplacian matrix
+
+        # frm: TODO: The NX version returns a matrix of integer values while the
+        #            RX version returns a matrix of floating point values.  I
+        #            think the reason is that the RX.adjacency_matrix() call
+        #            returns an array of floats.
+        #
+        #            Since the laplacian matrix is used for further numeric
+        #            processing, I don't think this matters, but I should
+        #            check to be 100% certain.
+
+        if self.is_nx_graph():
+            nx_graph = self._nx_graph
+            laplacian_matrix = networkx.laplacian_matrix(nx_graph)
+        elif self.is_rx_graph():
+            rx_graph = self._rx_graph
+            # 1. Get the adjacency matrix
+            adj_matrix = rustworkx.adjacency_matrix(rx_graph)
+            # 2. Calculate the degree matrix (simplified for this example)
+            degree_matrix = np.diag([rx_graph.degree(node) for node in rx_graph.node_indices()])
+            # 3. Calculate the Laplacian matrix
+            np_laplacian_matrix = degree_matrix - adj_matrix
+            # 4. Convert the NumPy array to a csr_array
+            laplacian_matrix = csr_array(np_laplacian_matrix)
+        else:
+            raise Exception("laplacian_matrix: badly configured graph parameter")
+
+        return laplacian_matrix
+
+    def normalized_laplacian_matrix(self):
+        if self.is_nx_graph():
+            nx_graph = self._nx_graph
+            laplacian_matrix = networkx.normalized_laplacian_matrix(nx_graph)
+        elif self.is_rx_graph():
+            # frm: TODO: Implement normalized_laplacian_matrix() for RX
+            rx_graph = self._rx_graph
+            raise Exception("normalized_laplacian_matrix NYI for RustworkX based Graph objects")
+        else:
+            raise Exception("normalized_laplacian_matrix: badly configured graph parameter")
+
+        return laplacian_matrix
+
+    def subgraphs_for_connected_components(self):
+        # Create a list of subgraphs - one for each subset of connected nodes in the graph
+        #
+        # This mirrors the nx.connected_components() routine in NetworkX
+
+        if self.is_nx_graph():
+            nx_graph = self.get_nx_graph()
+            subgraphs = [
+                self.subgraph(nodes) for nodes in networkx.connected_components(nx_graph)
+            ]
+        elif self.is_rx_graph():
+            rx_graph = self.get_rx_graph()
+            subgraphs = [
+                self.subgraph(nodes) for nodes in rustworkx.connected_components(rx_graph)
+            ]
+        else:
+            raise Exception("subgraphs_for_connected_components: Bad kind of Graph")
+
+        return subgraphs
+
+    def num_connected_components(self):
+        if self.is_nx_graph():
+            nx_graph = self.get_nx_graph()
+            connected_components = list(networkx.connected_components(nx_graph))
+        elif self.is_rx_graph():
+            rx_graph = self.get_rx_graph()
+            connected_components = rustworkx.connected_components(rx_graph)
+        else:
+            raise Exception("num_connected_components: Bad kind of Graph")
+
+        num_cc = len(connected_components)
+        return num_cc
+
+######################################################
+
+class OriginalGraph(networkx.Graph):
+    """
+    frm: This is the original code for gerrychain.Graph before any RustworkX changes.
+
+    It continues to exist so that I can write tests to verify that from the outside
+    the new Graph object behaves the same as the original Graph object.
+
+    See the test in tests/frm_tests/test_frm_old_vs_new_graph.py
+    """
+
+    # frm: Original Graph code...
     def __repr__(self):
         return "<Graph [{} nodes, {} edges]>".format(len(self.nodes), len(self.edges))
 
+    # frm: Original Graph code...
     @classmethod
     def from_networkx(cls, graph: networkx.Graph) -> "Graph":
         """
@@ -73,6 +1453,7 @@ def from_networkx(cls, graph: networkx.Graph) -> "Graph":
         g = cls(graph)
         return g
 
+    # frm: Original Graph code...
     @classmethod
     def from_json(cls, json_file: str) -> "Graph":
         """
@@ -91,6 +1472,7 @@ def from_json(cls, json_file: str) -> "Graph":
         graph.issue_warnings()
         return graph
 
+    # frm: Original Graph code...
     def to_json(
         self, json_file: str, *, include_geometries_as_geojson: bool = False
     ) -> None:
@@ -118,6 +1500,7 @@ def to_json(
         with open(json_file, "w") as f:
             json.dump(data, f, default=json_serialize)
 
+    # frm: Original Graph code...
     @classmethod
     def from_file(
         cls,
@@ -161,7 +1544,6 @@ def from_file(
             or install ``geopandas`` separately.
         """
-        import geopandas as gp
 
         df = gp.read_file(filename)
         graph = cls.from_geodataframe(
@@ -171,9 +1553,11 @@ def from_file(
             reproject=reproject,
             ignore_errors=ignore_errors,
         )
+
         graph.graph["crs"] = df.crs.to_json()
         return graph
 
+    # frm: Original Graph code...
     @classmethod
     def from_geodataframe(
         cls,
@@ -252,7 +1636,7 @@ def from_geodataframe(
 
         # Generate dict of dicts of dicts with shared perimeters according
         # to the requested adjacency rule
-        adjacencies = neighbors(df, adjacency)
+        adjacencies = neighbors(df, adjacency)  # Note - this is adjacency.neighbors()
 
         graph = cls(adjacencies)
 
         graph.geometry = df.geometry
@@ -284,6 +1668,7 @@ def from_geodataframe(
 
         return graph
 
+    # frm: Original Graph code...
     def lookup(self, node: Any, field: Any) -> Any:
         """
         Lookup a node/field attribute.
@@ -298,14 +1683,17 @@ def lookup(self, node: Any, field: Any) -> Any:
         """
         return self.nodes[node][field]
 
+    # frm: Original Graph code...
     @property
     def node_indices(self):
         return set(self.nodes)
 
+    # frm: Original Graph code...
     @property
     def edge_indices(self):
         return set(self.edges)
 
+    # frm: Original Graph code...
     def add_data(
         self, df: pd.DataFrame, columns: Optional[Iterable[str]] = None
     ) -> None:
@@ -334,6 +1722,7 @@ def add_data(
         else:
             self.data = df[columns]
 
+    # frm: Original Graph code...
     def join(
         self,
         dataframe: pd.DataFrame,
@@ -345,6 +1734,11 @@ def join(
         Add data from a dataframe to the graph, matching nodes to rows when
         the node's `left_index` attribute equals the row's `right_index` value.
 
+        This is the same as a "join" in SQL:
+            insert into <graph>
+            select <columns> from <dataframe>
+            where <left_index> = <right_index>.
+
         :param dataframe: DataFrame.
         :type dataframe: :class:`pandas.DataFrame`
         :columns: The columns whose data you wish to add to the graph.
@@ -359,6 +1753,14 @@ def join(
         :returns: None
         """
+        # frm: TODO: Implement this for RX.  Note, however, that this is probably
+        #            low priority since this routine is for building a graph
+        #            which for now (summer 2025) will continue to be done using
+        #            NetworkX.  That is, this code will not be used after
+        #            freezing the graph when we create a Partition...
+        if (not self.is_nx_graph()):
+            raise Exception("join(): Not supported for RX based Graph objects")
+
         if right_index is not None:
             df = dataframe.set_index(right_index)
         else:
@@ -369,15 +1771,27 @@ def join(
 
         check_dataframe(df)
 
+        # Transform the dataframe into a dict of dicts, where
+        # each column in the df is associated with a dict of
+        # <index>: <value> entries.
         column_dictionaries = df.to_dict()
 
+        # Determine what data in the graph to sync up with the
+        # data in the dataframe.  ids_to_index maps node_ids to
+        # values that select which row in dataframe should be
+        # associated with that node_id.
         if left_index is not None:
+            # frm: TODO: Figure out how to make this work for RX...
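+            # frm: A worked example (hypothetical data): if left_index == "GEOID",
+            #      ids_to_index might be {0: "26163", 1: "26125"}; node 0 then
+            #      receives the dataframe row whose index is "26163".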
            ids_to_index = networkx.get_node_attributes(self, left_index)
         else:
             # When the left_index is node ID, the matching is just
             # a redundant {node: node} dictionary
             ids_to_index = dict(zip(self.nodes, self.nodes))
 
+        # For each column in the dataframe, extract the appropriate entry for
+        # the given node_id (using index) and wrap it all up in a dict.  The
+        # result is a dict of (name, value) pairs of data from the dataframe
+        # for each node_id.
         node_attributes = {
             node_id: {
                 column: values[index] for column, values in column_dictionaries.items()
@@ -385,8 +1799,10 @@ def join(
             for node_id, index in ids_to_index.items()
         }
 
+        # frm: TODO: Figure out how to make this work for RX...
         networkx.set_node_attributes(self, node_attributes)
 
+    # frm: Original Graph code...
     @property
     def islands(self) -> Set:
         """
@@ -395,6 +1811,7 @@ def islands(self) -> Set:
         """
         return set(node for node in self if self.degree[node] == 0)
 
+    # frm: Original Graph code...
     def warn_for_islands(self) -> None:
         """
         :returns: None
@@ -407,6 +1824,7 @@ def warn_for_islands(self) -> None:
             "Found islands (degree-0 nodes). Indices of islands: {}".format(islands)
         )
 
+    # frm: Original Graph code...
     def issue_warnings(self) -> None:
         """
         :returns: None
@@ -415,8 +1833,7 @@ def issue_warnings(self) -> None:
         """
         self.warn_for_islands()
 
-
-def add_boundary_perimeters(graph: Graph, geometries: pd.Series) -> None:
+def add_boundary_perimeters(nx_graph: networkx.Graph, geometries: pd.Series) -> None:
     """
     Add shared perimeter between nodes and the total geometry boundary.
 
@@ -428,23 +1845,31 @@ def add_boundary_perimeters(graph: Graph, geometries: pd.Series) -> None:
     :returns: The updated graph.
     :rtype: Graph
     """
-    from shapely.ops import unary_union
-    from shapely.prepared import prep
+
+    # frm: TODO: Think about whether it is reasonable to require this to work
+    #            on a NetworkX.Graph object.
+
+    # frm: The original code operated on the Graph object which was a subclass of
+    #      NetworkX.Graph.  I have changed it to operate on a NetworkX.Graph object
+    #      with the understanding that callers will reach down into a Graph object
+    #      and pass in the inner nx_graph data member.
+
+    if not isinstance(nx_graph, networkx.Graph):
+        raise Exception("add_boundary_perimeters: Graph is not a NetworkX.Graph object")
 
     prepared_boundary = prep(unary_union(geometries).boundary)
 
     boundary_nodes = geometries.boundary.apply(prepared_boundary.intersects)
 
-    for node in graph:
-        graph.nodes[node]["boundary_node"] = bool(boundary_nodes[node])
+    for node in nx_graph:
+        nx_graph.nodes[node]["boundary_node"] = bool(boundary_nodes[node])
         if boundary_nodes[node]:
             total_perimeter = geometries[node].boundary.length
             shared_perimeter = sum(
-                neighbor_data["shared_perim"] for neighbor_data in graph[node].values()
+                neighbor_data["shared_perim"] for neighbor_data in nx_graph[node].values()
             )
             boundary_perimeter = total_perimeter - shared_perimeter
-            graph.nodes[node]["boundary_perim"] = boundary_perimeter
-
+            nx_graph.nodes[node]["boundary_perim"] = boundary_perimeter
 
 def check_dataframe(df: pd.DataFrame) -> None:
     """
@@ -524,6 +1949,15 @@ class FrozenGraph:
     The class uses `__slots__` for improved memory efficiency.
     """
 
+    # frm: TODO: Rename the internal data member, "graph", to be something else.
+    #            The reason is that a NetworkX.Graph object already has an internal
+    #            data member named, "graph", which is just a dict for the data
+    #            associated with the NetworkX.Graph object.
+ # + # So to avoid confusion, naming the frozen graph something like + # _frozen_graph would make it easier for a future reader of the + # code to avoid confusion... + __slots__ = ["graph", "size"] def __init__(self, graph: Graph) -> None: @@ -535,11 +1969,22 @@ def __init__(self, graph: Graph) -> None: :returns: None """ - self.graph = networkx.classes.function.freeze(graph) - self.graph.join = frozen - self.graph.add_data = frozen - self.size = len(self.graph) + # frm: Original code follows: + # + # self.graph = networkx.classes.function.freeze(graph) + # + # # frm: frozen is just a function that raises an exception if called... + # self.graph.join = frozen + # self.graph.add_data = frozen + # + # self.size = len(self.graph) + + # frm TODO: Add logic to have this work for RX. + + self.graph = graph + # frm: TODO: Not sure this works for RX + self.size = len(self.graph.nodes) def __len__(self) -> int: return self.size @@ -548,7 +1993,8 @@ def __getattribute__(self, __name: str) -> Any: try: return object.__getattribute__(self, __name) except AttributeError: - return object.__getattribute__(self.graph, __name) + # delegate getting the attribute to the graph data member + return self.graph.__getattribute__(__name) def __getitem__(self, __name: str) -> Any: return self.graph[__name] @@ -558,7 +2004,9 @@ def __iter__(self) -> Iterable[Any]: @functools.lru_cache(16384) def neighbors(self, n: Any) -> Tuple[Any, ...]: - return tuple(self.graph.neighbors(n)) + # frm: Original Code: + # return tuple(self.graph.neighbors(n)) + return self.graph.neighbors(n) @functools.cached_property def node_indices(self) -> Iterable[Any]: @@ -574,7 +2022,8 @@ def degree(self, n: Any) -> int: @functools.lru_cache(65536) def lookup(self, node: Any, field: str) -> Any: - return self.graph.nodes[node][field] + # frm: Original Code: return self.graph.nodes[node][field] + return self.node_data(node)[field] def subgraph(self, nodes: Iterable[Any]) -> "FrozenGraph": return FrozenGraph(self.graph.subgraph(nodes)) diff --git a/gerrychain/grid.py b/gerrychain/grid.py index b635c807..c2455d64 100644 --- a/gerrychain/grid.py +++ b/gerrychain/grid.py @@ -12,8 +12,19 @@ - typing: Used for type hints. """ + import math import networkx +# frm TODO: Decide whether to leave grid.py as-is, at least for now. +# While it imports NetworkX, it eventually creates a new +# Graph object which is added to a Partition which will +# eventually "freeze" and convert the new Graph object to +# be based on RX (under the covers). +# +# So, this can be thought of as legacy code that works just +# fine. In the future if we want to go full RX everywhere +# we can decide what to do. +# from gerrychain.partition import Partition from gerrychain.graph import Graph from gerrychain.updaters import ( @@ -62,6 +73,20 @@ def __init__( assignment: Optional[Dict] = None, updaters: Optional[Dict[str, Callable]] = None, parent: Optional["Grid"] = None, + # frm: ???: TODO: This code indicates that flips are a dict of tuple: int which would be + # correct for edge flips, but not for node flips. Need to check again + # to see if this is correct. Note that flips is used in the constructor + # so it should fall through to Partition._from_parent()... + # + # OK - I think that this is a bug. Parition._from_parent() assumes + # that flips are a mapping from node to partition not tuple/edge to partition. 
+    #                      I checked ALL of the code and the constructor for Grid is never passed in
+    #                      a flips parameter, so there is no example to check / verify, but it sure
+    #                      looks and smells like a bug.
+    #
+    #                      The fix would be to just change Dict[Tuple[int, int], int] to be
+    #                      Dict[int, int]
+    #
         flips: Optional[Dict[Tuple[int, int], int]] = None,
     ) -> None:
         """
@@ -95,9 +120,12 @@ def __init__(
 
         :raises Exception: If neither dimensions nor parent is provided.
         """
+
+        # Note that Grid graphs have node_ids that are tuples not integers.
+
         if dimensions:
             self.dimensions = dimensions
-            graph = Graph.from_networkx(create_grid_graph(dimensions, with_diagonals))
+            graph = Graph.from_networkx(_create_grid_nx_graph(dimensions, with_diagonals))
 
             if not assignment:
                 thresholds = tuple(math.floor(n / 2) for n in self.dimensions)
@@ -139,7 +167,20 @@ def as_list_of_lists(self):
         return [[self.assignment.mapping[(i, j)] for i in range(m)] for j in range(n)]
 
-def create_grid_graph(dimensions: Tuple[int, int], with_diagonals: bool) -> Graph:
+# frm TODO: Is this intended to be callable / useful for external users?
+#           For now, I am going to leave this as operating on NetworkX graphs, since
+#           it appears to only be used internally in this Class.  However, I may discover
+#           that it has been used externally with the intention of returning a Graph object.
+#           If so, then I will need to return a Graph object (from_networkx(nx_graph)) and change
+#           the call inside this class to expect a Graph object instead of a NetworkX.Graph object.
+
+# frm: TODO: Decide if I should change this to return a Graph object or not...
+
+# frm: Original Code - function signature:
+#      def create_grid_graph(dimensions: Tuple[int, int], with_diagonals: bool) -> Graph:
+#
+
+def _create_grid_nx_graph(dimensions: Tuple[int, int], with_diagonals: bool) -> networkx.Graph:
     """
     Creates a grid graph with the specified dimensions.
     Optionally includes diagonal connections between nodes.
@@ -157,9 +198,9 @@ def create_grid_graph(dimensions: Tuple[int, int], with_diagonals: bool) -> Graph:
     if len(dimensions) != 2:
         raise ValueError("Expected two dimensions.")
     m, n = dimensions
-    graph = networkx.generators.lattice.grid_2d_graph(m, n)
+    nx_graph = networkx.generators.lattice.grid_2d_graph(m, n)
 
-    networkx.set_edge_attributes(graph, 1, "shared_perim")
+    networkx.set_edge_attributes(nx_graph, 1, "shared_perim")
 
     if with_diagonals:
         nw_to_se = [
@@ -169,18 +210,31 @@ def create_grid_graph(dimensions: Tuple[int, int], with_diagonals: bool) -> Graph:
             ((i, j + 1), (i + 1, j)) for i in range(m - 1) for j in range(n - 1)
         ]
         diagonal_edges = nw_to_se + sw_to_ne
-        graph.add_edges_from(diagonal_edges)
+        # frm: TODO: Check that graph is an NX graph before calling graph.add_edges_from().  Eventually
+        #            make this work for RX too...
+        nx_graph.add_edges_from(diagonal_edges)
         for edge in diagonal_edges:
-            graph.edges[edge]["shared_perim"] = 0
+            # frm: TODO: When/if grid.py is converted to operate on GerryChain Graph
+            #            objects instead of NX.Graph objects, this use of NX
+            #            EdgeView to get/set edge data will need to change to use
+            #            gerrychain_graph.edge_data()
+            #
+            #            We will also need to think about edge vs edge_id.  In this
+            #            case we want an edge_id, so that means we need to look at
+            #            how diagonal_edges are created - but that is for the future...
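+            # frm: For illustration: NX keys edge data by node-pair tuples,
+            #      e.g. nx_graph.edges[(0, 0), (1, 1)]["shared_perim"], while
+            #      RX keys edge data by integer edge_ids, which is why
+            #      edge_data() would need an edge_id here.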
+ nx_graph.edges[edge]["shared_perim"] = 0 - networkx.set_node_attributes(graph, 1, "population") - networkx.set_node_attributes(graph, 1, "area") + # frm: These just set all nodes/edges in the graph to have the given attributes with a value of 1 + # frm: TODO: These won't work for the new graph, and they won't work for RX + networkx.set_node_attributes(nx_graph, 1, "population") + networkx.set_node_attributes(nx_graph, 1, "area") - tag_boundary_nodes(graph, dimensions) + _tag_boundary_nodes(nx_graph, dimensions) - return graph + return nx_graph +# frm ???: Why is this here instead of in graph.py? Who is it intended for? Internal vs. External? def give_constant_attribute(graph: Graph, attribute: Any, value: Any) -> None: """ Sets the specified attribute to the specified value for all nodes in the graph. @@ -195,10 +249,11 @@ def give_constant_attribute(graph: Graph, attribute: Any, value: Any) -> None: :returns: None """ for node in graph.nodes: - graph.nodes[node][attribute] = value + # frm original code: graph.nodes[node][attribute] = value + graph.node_data(node)[attribute] = value -def tag_boundary_nodes(graph: Graph, dimensions: Tuple[int, int]) -> None: +def _tag_boundary_nodes(nx_graph: networkx.Graph, dimensions: Tuple[int, int]) -> None: """ Adds the boolean attribute ``boundary_node`` to each node in the graph. If the node is on the boundary of the grid, that node also gets the attribute @@ -211,13 +266,30 @@ def tag_boundary_nodes(graph: Graph, dimensions: Tuple[int, int]) -> None: :returns: None """ + # + # frm: Another case of code that is not clear (at least to me). It took me + # a while to figure out that the name/label for a node in a grid graph + # is a tuple and not just a number or string. The tuple indicates its + # position in the grid (x,y) cartesian coordinates, so node[0] below + # means its x-position and node[1] means its y-position. So the if-stmt + # below tests whether a node is all the way on the left or the right or + # all the way on the top or the bottom. If so, it is tagged as a + # boundary node and it gets its boundary_perim value set - still not + # sure what that does/means... + # + # Peter's comment from PR: + # + # I think that being able to identify a boundary edge was needed in some early + # experiments, so it was important to tag them, but I haven't really something + # that cares about this in a while + m, n = dimensions - for node in graph.nodes: + for node in nx_graph.nodes: if node[0] in [0, m - 1] or node[1] in [0, n - 1]: - graph.nodes[node]["boundary_node"] = True - graph.nodes[node]["boundary_perim"] = get_boundary_perim(node, dimensions) + nx_graph.nodes[node]["boundary_node"] = True + nx_graph.nodes[node]["boundary_perim"] = get_boundary_perim(node, dimensions) else: - graph.nodes[node]["boundary_node"] = False + nx_graph.nodes[node]["boundary_node"] = False def get_boundary_perim(node: Tuple[int, int], dimensions: Tuple[int, int]) -> int: diff --git a/gerrychain/metagraph.py b/gerrychain/metagraph.py index 438af206..100131b2 100644 --- a/gerrychain/metagraph.py +++ b/gerrychain/metagraph.py @@ -27,6 +27,15 @@ def all_cut_edge_flips(partition: Partition) -> Iterator[Dict]: :returns: An iterator that yields dictionaries representing the flipped edges. :rtype: Iterator[Dict] """ + # frm: For my own edification... It took me a while to understand why + # this routine made sense at a high level. 
It finds all edges + # on the boundary of districts - those that are "cut edges" + # where one node is in one district and the other node is in + # another district. These are all of the places where you + # could move the boundary between districts by moving a single + # node. Stated differently, these are all of the places where + # you can make a single flip without creating a disconnected + # graph. for edge, index in product(partition.cut_edges, (0, 1)): yield {edge[index]: partition.assignment.mapping[edge[1 - index]]} diff --git a/gerrychain/metrics/partisan.py b/gerrychain/metrics/partisan.py index ef8be5cc..0be22991 100644 --- a/gerrychain/metrics/partisan.py +++ b/gerrychain/metrics/partisan.py @@ -9,6 +9,7 @@ import numpy from typing import Tuple +# frm: TODO: Why are these not just included in the file that defines ElectionResults? def mean_median(election_results) -> float: """ diff --git a/gerrychain/partition/assignment.py b/gerrychain/partition/assignment.py index b38ca49d..3844399a 100644 --- a/gerrychain/partition/assignment.py +++ b/gerrychain/partition/assignment.py @@ -37,6 +37,7 @@ def __init__( :raises ValueError: if the keys of ``parts`` are not unique :raises TypeError: if the values of ``parts`` are not frozensets """ + if validate: number_of_keys = sum(len(keys) for keys in parts.values()) number_of_unique_keys = len(set().union(*parts.values())) @@ -77,6 +78,14 @@ def update_flows(self, flows): """ Update the assignment for some nodes using the given flows. """ + # frm: Update the assignment of nodes to partitions by adding + # all of the new nodes and removing all of the old nodes + # as represented in the flows (dict keyed by district (part) + # of nodes flowing "in" and "out" for that district). + # + # Also, reset the mapping of node to partition (self.mapping) + # to reassign each node to its new partition. + # for part, flow in flows.items(): # Union between frozenset and set returns an object whose type # matches the object on the left, which here is a frozenset @@ -146,9 +155,64 @@ def from_dict(cls, assignment: Dict) -> "Assignment": passed-in dictionary. :rtype: Assignment """ + + # frm: TODO: Clean up from_dict(). + # + # A couple of things: + # * It uses a routine, level_sets(), which is only ever used here, so + # why bother having a separate routine. All it does is convert a dict + # mapping node_ids to parts into a dict mapping parts into sets of + # node_ids. Why not just have that code here inline? + # + # * Also, the constructor for Assignment explicitly allows for the caller + # to pass in a "mapping" of node_id to part, which we have right here. + # Why don't we pass it in and save having to recompute it? + # + parts = {part: frozenset(keys) for part, keys in level_sets(assignment).items()} return cls(parts) + + def new_assignment_convert_old_node_ids_to_new_node_ids(self, node_id_mapping: Dict) -> "Assignment": + """ + Create a new Assignment object from the one passed in, where the node_ids are changed + according to the node_id_mapping from old node_ids to new node_ids. + + This routine was motivated by the fact that node_ids are changed when converting from an + NetworkX based graph to a RustworkX based graph. An Assignment based on the node_ids in + the NetworkX based graph would need to be changed to use the new node_ids - the new + Asignment would be semantically equivalent - just converted to use the new node_ids in + the RX based graph. 
+
+        The node_id_mapping is of the form {old_node_id: new_node_id}
+        """
+
+        # Dict of the form: {node_id: part_id}
+        old_assignment_mapping = self.mapping
+        old_parts = self.parts
+
+        # convert old_node_ids to new_node_ids, keeping part IDs the same
+        new_assignment_mapping = {
+            node_id_mapping[old_node_id]: part
+            for old_node_id, part in old_assignment_mapping.items()
+        }
+
+        # Now update the parts dict that has a frozenset of all the nodes in each part (district)
+        new_parts = {}
+        for cur_node_id, cur_part in new_assignment_mapping.items():
+            if not cur_part in new_parts:
+                new_parts[cur_part] = set()
+            new_parts[cur_part].add(cur_node_id)
+        for cur_part, set_of_nodes in new_parts.items():
+            new_parts[cur_part] = frozenset(set_of_nodes)
+
+        # pandas.Series(data=part, index=nodes) for part, nodes in self.parts.items()
+
+        new_assignment = Assignment(
+            new_parts,
+            new_assignment_mapping
+        )
+
+        return new_assignment
 
 def get_assignment(
@@ -174,13 +238,23 @@ def get_assignment(
         is not provided.
     :raises TypeError: If the part_assignment is not a string or dictionary.
     """
+
+    # frm: TODO: Think about whether to split this into two functions.  At
+    #            present, it does different things based on whether
+    #            the "part_assignment" parameter is a string, a dict,
+    #            or an assignment.  Probably not worth the trouble (possible
+    #            legacy issues), but I just can't get used to the Python habit
+    #            of weak typing...
+
     if isinstance(part_assignment, str):
+        # Extract an assignment using the named node attribute
         if graph is None:
             raise TypeError(
                 "You must provide a graph when using a node attribute for the part_assignment"
             )
         return Assignment.from_dict(
-            {node: graph.nodes[node][part_assignment] for node in graph}
+            # frm: original code: {node: graph.nodes[node][part_assignment] for node in graph}
+            {node: graph.node_data(node)[part_assignment] for node in graph}
         )
 
     # Check if assignment is a dict or a mapping type
     elif callable(getattr(part_assignment, "items", None)):
diff --git a/gerrychain/partition/partition.py b/gerrychain/partition/partition.py
index 9f484f61..cbb933d7 100644
--- a/gerrychain/partition/partition.py
+++ b/gerrychain/partition/partition.py
@@ -1,5 +1,13 @@
 import json
-import networkx
+
+
+# frm: Only used in _first_time() inside __init__() to allow for creating
+#      a Partition from a NetworkX Graph object:
+#
+#          elif isinstance(graph, networkx.Graph):
+#              graph = Graph.from_networkx(graph)
+#              self.graph = FrozenGraph(graph)
+import networkx
 
 from gerrychain.graph.graph import FrozenGraph, Graph
 from ..updaters import compute_edge_flows, flows_from_changes, cut_edges
@@ -8,6 +16,18 @@
 from ..tree import recursive_tree_part
 from typing import Any, Callable, Dict, Optional, Tuple
 
+# frm TODO: Add documentation about how this all works.  For instance,
+#           what is computationally expensive and how does a FrozenGraph
+#           help?  Why do we need both assignments and parts?
+#
+#           Since a Partition is intimately tied up with how the Markov Chain
+#           does its magic, it would make sense to talk about that a bit...
+#
+#           For instance, is there any reason to use a Partition object
+#           except in a Markov Chain?  I suppose they are useful for post
+#           Markov Chain analysis - but if so, then it would be nice to
+#           know what functionality is tuned for the Markov Chain and what
+#           functionality / data is tuned for post Markov Chain analysis.
 
 class Partition:
     """
@@ -56,12 +76,24 @@ def __init__(
             which the functions compute.
         :param use_default_updaters: If `False`, do not include default updaters.
""" + if parent is None: + if graph is None: + raise Exception("Parition.__init__(): graph object is None") + self._first_time(graph, assignment, updaters, use_default_updaters) else: self._from_parent(parent, flips) self._cache = dict() + + #frm: SubgraphView provides cached access to subgraphs for each of the + # partition's districts. It is important that we asign subgraphs AFTER + # we have established what nodes belong to which parts (districts). In + # the case when the parent is None, the assignments are explicitly provided, + # and in the case when there is a parent, the _from_parent() logic processes + # the flips to update the assignments. + self.subgraphs = SubgraphView(self.graph, self.parts) @classmethod @@ -101,7 +133,10 @@ def from_random_assignment( :returns: The partition created with a random assignment :rtype: Partition """ - total_pop = sum(graph.nodes[n][pop_col] for n in graph) + # frm: ???: BUG: TODO: The param, flips, is never used in this routine... + + # frm: original code: total_pop = sum(graph.nodes[n][pop_col] for n in graph) + total_pop = sum(graph.node_data(n)[pop_col] for n in graph) ideal_pop = total_pop / n_parts assignment = method( @@ -120,18 +155,67 @@ def from_random_assignment( ) def _first_time(self, graph, assignment, updaters, use_default_updaters): - if isinstance(graph, Graph): - self.graph = FrozenGraph(graph) - elif isinstance(graph, networkx.Graph): + # Make sure that the embedded graph for the Partition is based on + # a RustworkX graph, and make sure it is also a FrozenGraph. Both + # of these are important for performance. + + # frm: TODO: Do we want to continue to allow users to create a Partition + # directly from an NX graph? I suppose there is no harm... + # + # The answer is YES - creating and manipulating NX Graphs is easy and users + # are familiar with doing so. It makes sense to preserve the use case of + # creating an NX-Graph and then allowing the code to under-the-covers + # convert to RX - both for legacy compatibility, but also because NX provides + # a really nice and easy way to create graphs. + # + # So this TODO should be interpreted as a todo-item to update the documentation + # to describe the use case of creating a graph using NX. That documentation + # should also describe how to post-process results of a MarkovChain run + # but I haven't figured that out yet... + + # If a NX.Graph, create a Graph object based on NX + if isinstance(graph, networkx.Graph): graph = Graph.from_networkx(graph) + + # if a Graph object, make sure it is based on an embedded RustworkX.PyGraph + if isinstance(graph, Graph): + if (graph.is_nx_graph()): + + # Get the assignment that would be appropriate for the NX-based graph + old_nx_assignment = get_assignment(assignment, graph) + + # Convert the NX graph to be an RX graph + graph = graph.convert_from_nx_to_rx() + + # After converting from NX to RX, we need to update the Partition's assignment + # because it used the old NX node_ids (converting to RX changes node_ids) + nx_to_rx_node_id_map = graph.get_nx_to_rx_node_id_map() + new_rx_assignment = old_nx_assignment.new_assignment_convert_old_node_ids_to_new_node_ids( + nx_to_rx_node_id_map + ) + self.assignment = new_rx_assignment + + # We also have to update the _node_id_to_original_node_id_map to refer to the node_ids + # in the NX Graph object. 
+ _node_id_to_original_node_id_map = {} + for node_id in graph.nodes: + original_node_id = graph.node_data(node_id)["__networkx_node__"] + _node_id_to_original_node_id_map[node_id] = original_node_id + graph._node_id_to_original_node_id_map = _node_id_to_original_node_id_map + + else: + self.assignment = get_assignment(assignment, graph) + self.graph = FrozenGraph(graph) + elif isinstance(graph, FrozenGraph): + # frm: TODO: Verify that the embedded graph is RX self.graph = graph + self.assignment = get_assignment(assignment, graph) + else: raise TypeError(f"Unsupported Graph object with type {type(graph)}") - self.assignment = get_assignment(assignment, graph) - if set(self.assignment) != set(graph): raise KeyError("The graph's node labels do not match the Assignment's keys") @@ -145,11 +229,25 @@ def _first_time(self, graph, assignment, updaters, use_default_updaters): self.updaters.update(updaters) + # Note that the updater functions are executed lazily - that is, only when + # a caller asks for the results, such as partition["perimeter"]. See the code + # for __getitem__(). + # + # So no need to execute the updater functions now... + self.parent = None self.flips = None self.flows = None self.edge_flows = None + # frm ???: This is only called once and it is tagged as an internal + # function (leading underscore). Is there a good reason + # why this is not internal to the __init__() routine + # where it is used? + # + # That is, is there any reason why anyone might ever + # call this except __init__()? + def _from_parent(self, parent: "Partition", flips: Dict) -> None: self.parent = parent self.flips = flips @@ -173,7 +271,7 @@ def __repr__(self): def __len__(self): return len(self.parts) - def flip(self, flips: Dict) -> "Partition": + def flip(self, flips: Dict, use_original_node_ids=False) -> "Partition": """ Returns the new partition obtained by performing the given `flips` on this partition. @@ -182,6 +280,29 @@ def flip(self, flips: Dict) -> "Partition": :returns: the new :class:`Partition` :rtype: Partition """ + + # frm: TODO: Change comments above to document new optional parameter, use_original_node_ids. + # + # This is a new issue that arises from the fact that node_ids in RX are different from those + # in the original NX graph. In the pre-RX code, we did not need to distinguish between + # calls to flip() that were internal code used when doing a MarkovChain versus user code + # for instance in tests. However, in the new RX world, the internal code uses RX node_ids + # and the tests want to use "original" NX node_ids. Hence the new parameter. + + # If the caller identified flips in terms of "original" node_ids (typically node_ids associated with + # an NX-based graph before creating a Partition object), then translate those original node_ids + # into the appropriate internal RX-based node_ids. + # + # Note that original node_ids in flips are typically used in tests + # + + if use_original_node_ids: + new_flips = {} + for original_node_id, part in flips.items(): + internal_node_id = self.graph.internal_node_id_for_original_node_id(original_node_id) + new_flips[internal_node_id] = part + flips = new_flips + return self.__class__(parent=self, flips=flips) def crosses_parts(self, edge: Tuple) -> bool: @@ -205,11 +326,44 @@ def __getitem__(self, key: str) -> Any: :returns: The value of the updater. :rtype: Any """ + # frm: Cleverness Alert: Delayed evaluation of updater functions... 
+ # + # The code immediately below executes the appropriate updater function + # if it has not already been executed and then caches the results. + # This makes sense - why compute something if nobody ever wants it, + # but it took me a while to figure out why the constructor did not + # explicitly call the updaters. + # + if key not in self._cache: + # frm: TODO: Add code to check that the desired updater actually is + # defined in the list of updaters. If not, then this + # would produce a perhaps difficult to debug problem... self._cache[key] = self.updaters[key](self) return self._cache[key] def __getattr__(self, key): + # frm TODO: Not sure it makes sense to allow two ways to accomplish the same thing... + # + # The code below allows Partition users to get the results of updaters by just + # doing: partition. which is the same as doing: partition[""] + # It is clever, but perhaps too clever. Why provide two ways to do the same thing? + # + # It is also odd on a more general level - this approach means that the attributes of a + # Partition are the same as the names of the updaters and return the results of running + # the updater functions. I guess this makes sense, but there is no documentation (that I + # am aware of) that makes this clear. + # + # Peter's comment in PR: + # + # This is actually on my list of things that I would prefer removed. When I first + # started working with this codebase, I found the fact that you could just do + # partition.name_of_my_updater really confusing, and, from a Python perspective, + # I think that the more intuitive interface is keyword access like in a dictionary. + # I haven't scoured the codebase for instances of ".attr" yet, but this is one of + # the things that I am 100% okay with getting rid of. Almost all of the people + # that I have seen work with this package use the partition["attr"] paradigm anyway. + # return self[key] def keys(self): @@ -220,6 +374,15 @@ def parts(self): return self.assignment.parts def plot(self, geometries=None, **kwargs): + # + # frm ???: I think that this plots districts on a map that is defined + # by the geometries parameter (presumably polygons or something similar). + # It converts the partition data into data that the plot routine + # knows how to deal with, but essentially it just assigns each node + # to a district. the **kwargs are then passed to the plotting + # engine - presumably to define colors and other graph stuff. + # + """ Plot the partition, using the provided geometries. @@ -237,6 +400,8 @@ def plot(self, geometries=None, **kwargs): if geometries is None: geometries = self.graph.geometry + # frm: TODO: Test that self.graph.geometry is not None - but first need to grok + # where this is set (other than Graph.from_geodataframe()) if set(geometries.index) != set(self.graph.nodes): raise TypeError( @@ -285,13 +450,15 @@ def from_districtr_file( id_column_key = districtr_plan["idColumn"]["key"] districtr_assignment = districtr_plan["assignment"] try: - node_to_id = {node: str(graph.nodes[node][id_column_key]) for node in graph} + # frm: original code: node_to_id = {node: str(graph.nodes[node][id_column_key]) for node in graph} + node_to_id = {node: str(graph.node_data(node)[id_column_key]) for node in graph} except KeyError: raise TypeError( "The provided graph is missing the {} column, which is " "needed to match the Districtr assignment to the nodes of the graph." ) + # frm: TODO: NX vs. RX issues: does "node in graph" work for both NX and RX? 
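+        # frm: For example (hypothetical data): with id_column_key "GEOID",
+        #      node_to_id might be {0: "26163"}, and if the districtr
+        #      assignment maps "26163" -> 3, node 0 is assigned to district 3.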
assignment = {node: districtr_assignment[node_to_id[node]] for node in graph} return cls(graph, assignment, updaters) diff --git a/gerrychain/partition/subgraphs.py b/gerrychain/partition/subgraphs.py index b282a510..e0b5a0ae 100644 --- a/gerrychain/partition/subgraphs.py +++ b/gerrychain/partition/subgraphs.py @@ -1,6 +1,14 @@ from typing import List, Any, Tuple from ..graph import Graph +# frm: ???: TODO: Is this ever actually used by any other code? If so, where and for what? +# YES - it is used as the type of Partition.subgraphs. So, I need to find +# all access to partition.subgraphs and see how it is used. It is also +# used in contiguity.py +# +# This may be an opportunity to encapsulate knowldege of node_indices vs. +# node_names... + class SubgraphView: """ diff --git a/gerrychain/proposals/proposals.py b/gerrychain/proposals/proposals.py index 988c7467..93c2df10 100644 --- a/gerrychain/proposals/proposals.py +++ b/gerrychain/proposals/proposals.py @@ -111,6 +111,8 @@ def slow_reversible_propose_bi(partition: Partition) -> Partition: :rtype: Partition """ + # frm: TODO: Rename x to be edge *sigh*... + b_nodes = {x[0] for x in partition["cut_edges"]}.union( {x[1] for x in partition["cut_edges"]} ) diff --git a/gerrychain/proposals/spectral_proposals.py b/gerrychain/proposals/spectral_proposals.py index 3c213a94..6c687f36 100644 --- a/gerrychain/proposals/spectral_proposals.py +++ b/gerrychain/proposals/spectral_proposals.py @@ -1,57 +1,112 @@ -import networkx as nx +import networkx as nx # frm: only used to get access to laplacian functions... from numpy import linalg as LA import random from ..graph import Graph from ..partition import Partition from typing import Dict, Optional - +# frm: only ever used in this file - but maybe it is used externally? def spectral_cut( - graph: Graph, part_labels: Dict, weight_type: str, lap_type: str + subgraph: Graph, part_labels: Dict, weight_type: str, lap_type: str ) -> Dict: """ Spectral cut function. - Uses the signs of the elements in the Fiedler vector of a graph to + Uses the signs of the elements in the Fiedler vector of a subgraph to partition into two components. - :param graph: The graph to be partitioned. - :type graph: Graph - :param part_labels: The current partition of the graph. + :param subgraph: The subgraph to be partitioned. + :type subgraph: Graph + :param part_labels: The current partition of the subgraph. :type part_labels: Dict :param weight_type: The type of weight to be used in the Laplacian. :type weight_type: str :param lap_type: The type of Laplacian to be used. :type lap_type: str - :returns: A dictionary assigning nodes of the graph to their new districts. + :returns: A dictionary assigning nodes of the subgraph to their new districts. :rtype: Dict """ - nlist = list(graph.nodes()) - n = len(nlist) + # This routine operates on subgraphs, which is important because the node_ids + # in a subgraph are different from the node_ids of the parent graph, so + # the return value's node_ids need to be translated back into the appropriate + # parent node_ids. + + # frm: TODO: Subtle issue here - in NX there is no difference between a node ID + # and a node index (what you use to get a node from a list), but + # in RX there is a difference - which manifests most in subgraphs + # where RX goes ahead and renumbers the nodes in the graph. To + # make subgraphs work, we remember (in a map) what the node "IDs" + # of the parent graph were. + # + # The issue here is what the code wants here. 
We are in an RX
+    #            world at this point - so maybe it doesn't matter, but worth
+    #            thinking about...
+
+    node_list = list(subgraph.nodes)
+    num_nodes = len(node_list)
 
     if weight_type == "random":
-        for edge in graph.edge_indices:
-            graph.edges[edge]["weight"] = random.random()
-
+        # assign a random weight to each edge in the subgraph
+        for edge_id in subgraph.edge_indices:
+            subgraph.edge_data(edge_id)["weight"] = random.random()
+
+    # frm TODO: NX vs. RX Issue: NYI: normalized_laplacian_matrix() for RX
+    #
+    #     Note that while the standard laplacian is straightforward mathematically,
+    #     the normalized laplacian is a good bit more complicated.  However, since
+    #     NetworkX is open source - perhaps we can get permission to just use their
+    #     code to create RX versions...
+
+    # Compute the desired laplacian matrix (convert from sparse to dense)
     if lap_type == "normalized":
-        LAP = (nx.normalized_laplacian_matrix(graph)).todense()
-
+        laplacian_matrix = (subgraph.normalized_laplacian_matrix()).todense()
     else:
-        LAP = (nx.laplacian_matrix(graph)).todense()
+        laplacian_matrix = (subgraph.laplacian_matrix()).todense()
+
+    # frm TODO: Add a better explanation for why eigenvectors are useful
+    #           for determining flips.  Perhaps just a URL to an article
+    #           somewhere...
+    #
+    #           I have added comments to describe the nuts and bolts of what is happening,
+    #           but the overall rationale for this code is missing - and it should be here...
 
-    NLMva, NLMve = LA.eigh(LAP)
-    NFv = NLMve[:, 1]
-    xNFv = [NFv.item(x) for x in range(n)]
+    # The LA.eigh(laplacian_matrix) call invokes the eigh() function from
+    # the NumPy LinAlg module which:
+    #
+    #     "returns the eigenvalues and eigenvectors of a complex Hermitian
+    #      ... or a real symmetric matrix."
+    #
+    # In our case we have a symmetric matrix, so it returns two
+    # objects - a 1-D numpy array containing the eigenvalues (which we don't
+    # care about) and a 2-D numpy square matrix of the eigenvectors.
+    numpy_eigen_values, numpy_eigen_vectors = LA.eigh(laplacian_matrix)
 
-    node_color = [xNFv[x] > 0 for x in range(n)]
+    # Extract the Fiedler vector (the eigenvector for the second-smallest
+    # eigenvalue) as a numpy array.  Per the docstring above, the signs of
+    # its entries are what split the nodes into two components.
+    numpy_eigen_vector = numpy_eigen_vectors[:, 1]
 
-    clusters = {nlist[x]: part_labels[node_color[x]] for x in range(n)}
+    # Convert to an array of normal Python numbers (not numpy based)
+    eigen_vector_array = [numpy_eigen_vector.item(x) for x in range(num_nodes)]
 
-    return clusters
+    # node_color will be True or False depending on whether the value in the
+    # eigen_vector_array is positive or negative.  In the code below, this
+    # is equivalent to node_color being 1 or 0 (since Python treats True as 1
+    # and False as 0)
+    node_color = [eigen_vector_array[x] > 0 for x in range(num_nodes)]
 
+    # Create flips using the node_color to select which part (district) to assign
+    # to the node.
+    flips = {node_list[x]: part_labels[node_color[x]] for x in range(num_nodes)}
 
+    # translate subgraph node_ids in flips to parent_graph node_ids
+    translated_flips = subgraph.translate_subgraph_node_ids_for_flips(flips)
+
+    return translated_flips
+
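+
+# frm: A tiny worked illustration of the sign-based cut above: for the path
+#      graph 0-1-2-3, the Fiedler vector is (up to an overall sign) roughly
+#      [0.65, 0.27, -0.27, -0.65], so node_color = [True, True, False, False]
+#      and nodes {0, 1} land in one part while {2, 3} land in the other.
+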
+# frm: only ever used in this file - but maybe it is used externally?
 def spectral_recom(
     partition: Partition,
     weight_type: Optional[str] = None,
@@ -88,16 +143,23 @@ def spectral_recom(
     :rtype: Partition
     """
 
-    edge = random.choice(tuple(partition["cut_edges"]))
+    # Select two adjacent parts (districts) at random by first selecting
+    # a cut_edge at random and then figuring out the parts (districts)
+    # associated with the edge.
+    cut_edge = random.choice(tuple(partition["cut_edges"]))
 
     parts_to_merge = (
-        partition.assignment.mapping[edge[0]],
-        partition.assignment.mapping[edge[1]],
+        partition.assignment.mapping[cut_edge[0]],
+        partition.assignment.mapping[cut_edge[1]],
     )
 
-    subgraph = partition.graph.subgraph(
-        partition.parts[parts_to_merge[0]] | partition.parts[parts_to_merge[1]]
-    )
+    subgraph_nodes = partition.parts[parts_to_merge[0]] | partition.parts[parts_to_merge[1]]
 
-    flips = spectral_cut(subgraph, parts_to_merge, weight_type, lap_type)
+    # Cut the set of all nodes from parts_to_merge into two hopefully new parts (districts)
+    flips = spectral_cut(
+        partition.graph.subgraph(subgraph_nodes),
+        parts_to_merge,
+        weight_type,
+        lap_type
+    )
 
     return partition.flip(flips)
diff --git a/gerrychain/proposals/tree_proposals.py b/gerrychain/proposals/tree_proposals.py
index e66a718b..71c68b4f 100644
--- a/gerrychain/proposals/tree_proposals.py
+++ b/gerrychain/proposals/tree_proposals.py
@@ -7,14 +7,14 @@
     epsilon_tree_bipartition,
     bipartition_tree,
     bipartition_tree_random,
-    _bipartition_tree_random_all,
+    bipartition_tree_random_with_num_cuts,
     uniform_spanning_tree,
     find_balanced_edge_cuts_memoization,
     ReselectException,
 )
 from typing import Callable, Optional, Dict, Union
 
-
+# frm: only used in this file
 class MetagraphError(Exception):
     """
     Raised when the partition we are trying to split is a low degree
@@ -24,6 +24,7 @@ class MetagraphError(Exception):
     pass
 
+# frm: only used in this file
 class ValueWarning(UserWarning):
     """
     Raised when a particular value is technically valid, but may
@@ -89,6 +90,6 @@ def recom(
     :type method: Callable, optional
 
     :returns: The new partition resulting from the ReCom algorithm.
     :rtype: Partition
     """
 
@@ -99,9 +101,14 @@ def recom(
     # Try to add the region aware in if the method accepts the surcharge dictionary
     if "region_surcharge" in signature(method).parameters:
         method = partial(method, region_surcharge=region_surcharge)
-
+
     while len(bad_district_pairs) < tot_pairs:
+        # frm: In no particular order, try to merge and then split pairs of districts
+        #      that have a cut_edge - meaning that they are adjacent, until you either
+        #      find one that can be split, or you have tried all possible pairs
+        #      of adjacent districts...
         try:
+            # frm: TODO: see if there is some way to avoid a while True loop...
             while True:
                 edge = random.choice(tuple(partition["cut_edges"]))
                 # Need to sort the tuple so that the order is consistent
@@ -115,12 +122,11 @@ def recom(
                 if tuple(parts_to_merge) not in bad_district_pairs:
                     break
 
-            subgraph = partition.graph.subgraph(
-                partition.parts[parts_to_merge[0]] | partition.parts[parts_to_merge[1]]
-            )
+            # frm: Note that the vertical bar operator merges the two sets into one set.
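+            # frm: e.g. frozenset({1, 2}) | frozenset({3}) == frozenset({1, 2, 3})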
+ subgraph_nodes = partition.parts[parts_to_merge[0]] | partition.parts[parts_to_merge[1]] flips = epsilon_tree_bipartition( - subgraph.graph, + partition.graph.subgraph(subgraph_nodes), parts_to_merge, pop_col=pop_col, pop_target=pop_target, @@ -132,6 +138,7 @@ def recom( except Exception as e: if isinstance(e, ReselectException): + # frm: Add this pair to list of pairs that did not work... bad_district_pairs.add(tuple(parts_to_merge)) continue else: @@ -176,6 +183,7 @@ def reversible_recom( :param balance_edge_fn: The balance edge function. Default is find_balanced_edge_cuts_memoization. :type balance_edge_fn: Callable, optional + frm: it returns a list of Cuts - a named tuple defined in tree.py :param M: The maximum number of balance edges. Default is 1. :type M: int, optional :param repeat_until_valid: Flag indicating whether to repeat until a valid partition is @@ -189,8 +197,11 @@ def reversible_recom( """ def dist_pair_edges(part, a, b): + # frm: Find all edges that cross from district a into district b return set( e + # frm: TODO: edges vs. edge_ids: edges are wanted here (tuples) + # frm: Original Code: for e in part.graph.edges for e in part.graph.edges if ( ( @@ -212,41 +223,106 @@ def bounded_balance_edge_fn(*args, **kwargs): ) return cuts + """ + frm: Original Code: + bipartition_tree_random_reversible = partial( _bipartition_tree_random_all, repeat_until_valid=repeat_until_valid, spanning_tree_fn=uniform_spanning_tree, balance_edge_fn=bounded_balance_edge_fn, ) + + I deemed this code to be evil, if only because it used an internal tree.py routine + _bipartition_tree_random_all(). This internal routine returns a set of Cut objects + which otherwise never appear outside tree.py, so this just adds complexity. + + The only reason the original code used _bipartition_tree_random_all() instead of just + using bipartition_tree_random() is that it needs to know how many possible new + districts there are. So, I created a new function in tree.py that does EXACTLY + what bipartition_tree_random() does but which also returns the number of possible + new districts. + + """ + bipartition_tree_random_reversible = partial( + bipartition_tree_random_with_num_cuts, + repeat_until_valid=repeat_until_valid, + spanning_tree_fn=uniform_spanning_tree, + balance_edge_fn=bounded_balance_edge_fn, + ) parts = sorted(list(partition.parts.keys())) dist_pairs = [] for out_part in parts: for in_part in parts: dist_pairs.append((out_part, in_part)) + # frm: TODO: Grok why this code considers pairs that are the same part... + # + # For instance, if there are only two parts (districts), then this code will + # produce four pairs: (0,0), (0,1), (1,0), (1,1). The code below tests + # to see if there is any adjacency, but there will never be adjacency between + # the same part (district). Why not just prune out all pairs that have the + # same two values and save an interation of the entire chain? + # + # Stated differently, is there any value in doing an entire chain iteration + # when we randomly select the same part (district) to merge with itself??? + # + # A similar issue comes up if there are no pair_edges (below). We waste + # an entire iteration in that case too - which seems kind of dumb... + # random_pair = random.choice(dist_pairs) pair_edges = dist_pair_edges(partition, *random_pair) if random_pair[0] == random_pair[1] or not pair_edges: return partition # self-loop: no adjacency + # frm: TODO: Grok why it is OK to return the partition unchanged as the next step. 
+    #
+    # This runs the risk of running an entire chain without ever changing the partition.
+    # I assume that the logic is that there is deliberate randomness introduced each time,
+    # so eventually, if it is possible, the chain will get started, but it seems like there
+    # should be some kind of check to detect when it never gets started, so that the
+    # user can have a clue about what is going on...

    edge = random.choice(list(pair_edges))
    parts_to_merge = (
        partition.assignment.mapping[edge[0]],
        partition.assignment.mapping[edge[1]],
    )

-    subgraph = partition.graph.subgraph(
-        partition.parts[parts_to_merge[0]] | partition.parts[parts_to_merge[1]]
-    )
-
-    all_cuts = bipartition_tree_random_reversible(
-        subgraph, pop_col=pop_col, pop_target=pop_target, epsilon=epsilon
+    # Remember the node_ids from which the subgraph was created - we will need them below
+    subgraph_nodes = partition.parts[parts_to_merge[0]] | partition.parts[parts_to_merge[1]]
+
+    # frm: Note: This code has changed to make sure we don't access subgraph node_ids.
+    #      The former code saved the subgraph and used its nodes to compute
+    #      the remaining_nodes, but this doesn't work with RX, because the
+    #      node_ids for the subgraph are different from those in the parent graph.
+    #      The solution is to just remember the parent node_ids that were used
+    #      to create the subgraph, and to move the subgraph call in as an actual
+    #      parameter, so that after the call there is no way to reference it.
+    #
+    #      Going forward, this should be a coding style - only invoke Graph.subgraph()
+    #      as an actual parameter so that there is no way to inadvertently access
+    #      the subgraph's node_ids afterwards.

+    # frm: TODO: Clean up the code below - I munged it for debugging ...

+    # frm: Original Code:
+    # num_possible_districts, nodes = bipartition_tree_random_reversible(
+    #     partition.graph.subgraph(subgraph_nodes),
+    #     pop_col=pop_col, pop_target=pop_target, epsilon=epsilon
+    # )
+    result = bipartition_tree_random_reversible(
+        partition.graph.subgraph(subgraph_nodes),
+        pop_col=pop_col, pop_target=pop_target, epsilon=epsilon
    )

-    if not all_cuts:
+    if not result:
        return partition  # self-loop: no balance edge

-    nodes = choice(all_cuts).subset
-    remaining_nodes = set(subgraph.nodes()) - set(nodes)
+    num_possible_districts, nodes = result
+
+    remaining_nodes = subgraph_nodes - set(nodes)

+    # frm: Notes to Self: the ** operator below merges the two dicts into a single dict.
    flips = {
        **{node: parts_to_merge[0] for node in nodes},
        **{node: parts_to_merge[1] for node in remaining_nodes},
@@ -255,7 +331,7 @@ def bounded_balance_edge_fn(*args, **kwargs):
    new_part = partition.flip(flips)
    seam_length = len(dist_pair_edges(new_part, *random_pair))

-    prob = len(all_cuts) / (M * seam_length)
+    prob = num_possible_districts / (M * seam_length)
    if prob > 1:
        raise ReversibilityError(
            f"Found {num_possible_districts} balance edges, but "
@@ -267,6 +343,24 @@ def bounded_balance_edge_fn(*args, **kwargs):

    return partition  # self-loop


+# frm TODO: I do not think that ReCom() is ever called. Note that it
+#     only defines a constructor and a __call__(), which would allow
+#     you to call the recom() function by creating a ReCom object and then
+#     "calling" that object - why not just call the recom function?
+#
+#     ...confused...
+#
+#     My guess is that someone started writing this code thinking that
+#     a class would make sense but then realized that the only use
+#     was to call the recom() function, and never went back to remove
+#     the class.
+#     In short, I think that we should probably remove the class and just
+#     keep the function...
+#
+#     What Peter said in a PR:
+#
+#         Another bit of legacy code. I am also not sure why this exists. Seems like
+#         there were plans for this and then it got dropped when someone graduated
+
class ReCom:
    """
    ReCom (short for ReCombination) is a class that represents a ReCom proposal
diff --git a/gerrychain/tree.py b/gerrychain/tree.py
index 06ce8433..2732f0c5 100644
--- a/gerrychain/tree.py
+++ b/gerrychain/tree.py
@@ -12,7 +12,7 @@
and methods for assessing and modifying this data.
- Functions for finding balanced edge cuts in a populated graph, either through
  contraction or memoization techniques.
-- A suite of functions (`bipartition_tree`, `recursive_tree_part`, `get_seed_chunks`, etc.)
+- A suite of functions (`bipartition_tree`, `recursive_tree_part`, `_get_seed_chunks`, etc.)
  for partitioning graphs into balanced subsets based on population targets and tolerances.
- Utility functions like `get_max_prime_factor_less_than` and `recursive_seed_part_inner`
  to assist in complex partitioning tasks.
@@ -25,9 +25,62 @@
Last Updated: 25 April 2024
"""

+# frm: This file, tree.py, needed to be modified to operate on new Graph
+#      objects instead of NetworkX Graph objects because the routines are
+#      used by the Graph objects inside a Partition, which will soon be based
+#      on RustworkX. More specifically, these routines are used by Proposals,
+#      and we will soon switch to having the underlying Graph object used
+#      in Partitions and Proposals be based on RustworkX.
+#
+#      It may be the case that they are ONLY ever used by Proposals and
+#      hence could just have been rewritten to operate on RustworkX Graph
+#      objects, but there seemed to be no harm in having them work either
+#      way. It was also a good proving ground for testing whether the new
+#      Graph object could behave like a NetworkX Graph object (in terms of
+#      attribute access and syntax).

+"""
+frm: RX Documentation
+
+Many of the functions in this file operate on subgraphs, which are different from
+NX subgraphs because the node_ids change in the subgraph. To deal with this,
+in graph.py we have a _node_id_to_parent_node_id_map data member for Graph objects which maps
+the node_ids in a subgraph to the corresponding node_id in its parent graph. This
+will allow routines operating on subgraphs to return results using the node_ids
+of the parent graph.
+
+Note that for top-level graphs, we still define this _node_id_to_parent_node_id_map, but in
+this case it is an identity map that just maps each node_id to itself. This allows
+code to always translate correctly, even if operating on a top-level graph.
+
+As a matter of coding convention, all calls to graph.subgraph() have been placed
+in the actual parameter list of function calls. This limits the scope of the
+subgraph node_ids to the called function - eliminating the risk of those node_ids
+leaking into surrounding code. Stated differently, this eliminates the cognitive
+load of trying to remember whether a node_id is a parent or a subgraph node_id.
+"""

import networkx as nx
-from networkx.algorithms import tree
+import rustworkx as rx
+import numpy as np
+from scipy.sparse import csr_array
+# frm TODO: Remove the imports of networkx and rustworkx once we have moved the networkx
+#     dependencies out of this file - see comments below on
+#     spanning trees.
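+# frm: A tiny, self-contained demonstration of WHY the _node_id_to_parent_node_id_map
+#      is needed. This is a sketch using raw rustworkx, not GerryChain's Graph
+#      wrapper; rebuilding the map from payloads only works here because the
+#      payloads are unique - real code would record the map as the subgraph is made.

import rustworkx as rx

g = rx.PyGraph()
ids = g.add_nodes_from(["A", "B", "C"])            # parent node_ids: 0, 1, 2
g.add_edges_from_no_data([(ids[0], ids[1]), (ids[1], ids[2])])

sub = g.subgraph([ids[1], ids[2]])                 # payloads "B" and "C" survive
print(list(sub.node_indices()))                    # [0, 1]  <- renumbered!

# Recover a subgraph-node_id -> parent-node_id map from the (unique) payloads
payload_to_parent = {g[n]: n for n in g.node_indices()}
node_id_to_parent_node_id_map = {n: payload_to_parent[sub[n]] for n in sub.node_indices()}
print(node_id_to_parent_node_id_map)               # e.g. {0: 1, 1: 2}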
+ +import networkx.algorithms.tree as nxtree +# frm TODO: Remove import of "tree" from networkx.algorithms in this file +# It is only used to get a spanning tree function: +# +# spanning_tree = nxtree.minimum_spanning_tree( +# +# There is an RX function that also computes a spanning tree - hopefully +# it works as we want it to work and hence can be used. +# +# I think it probably makes sense to move this spanning tree function +# into graph.py and to encapsulate the NX vs RX code there. +# +# Note Peter agrees with this... from functools import partial from inspect import signature @@ -48,18 +101,15 @@ ) import warnings +# frm: import the new Graph object which encapsulates NX and RX Graph... +from .graph import Graph -def predecessors(h: nx.Graph, root: Any) -> Dict: - return {a: b for a, b in nx.bfs_predecessors(h, root)} - - -def successors(h: nx.Graph, root: Any) -> Dict: - return {a: b for a, b in nx.bfs_successors(h, root)} - +# frm TODO: Update function param docmentation to get rid of nx.Graph and use just Graph def random_spanning_tree( - graph: nx.Graph, region_surcharge: Optional[Dict] = None -) -> nx.Graph: + graph: Graph, # frm: Original code: graph: x.Graph, + region_surcharge: Optional[Dict] = None +) -> Graph: # frm: Original code: ) -> nx.Graph: """ Builds a spanning tree chosen by Kruskal's method using random weights. @@ -72,31 +122,137 @@ def random_spanning_tree( :returns: The maximal spanning tree represented as a Networkx Graph. :rtype: nx.Graph """ + # frm: ???: + # This seems to me to be an expensive way to build a random spanning + # tree. It calls a routine to compute a "minimal" spanning tree that + # computes the total "weight" of the spanning tree and selects the + # minmal total weight. By making the weights random, this will select + # a different spanning tree each time. This works, but it does not + # in any way depend on the optimization. + # + # Why isn't the uniform_spanning_tree() below adequate? It takes + # a random walk at each point to create the spanning tree. This + # would seem to be a much cheaper way to calculate a spanning tree. + # + # What am I missing??? + + """ + frm: RX Documentation: + + As far as I can tell a spanning tree is only ever used to populate a PopulatedGraph + and so, there is no need to worry about translating the spanning tree's nodes into + the context of the parent. Stated differently, a spanning tree is not used to + compute something about a subgraph but rather to compute something about whatever + graph is currently being dealt with. + + In short, I am assuming that we can ignore the fact that RX subgraphs have different + node_ids for this function and all will be well... + """ + + # frm: TODO: WTF is up with region_surcharge being unset? The region_surcharge + # is only ever accessed in this routine in the for-loop below to + # increase the weight on the edge - setting it to be an empty dict + # just prevents the code below from blowing up. Why not just put + # a test for the surcharge for-loop alone: + # + # if not region_surcharge is None: + # for key, value in region_surcharge.items(): + # ... + # + # Peter's comments from PR: + # + # peterrrock2 last week + # This is one of mine. I added the region surcharge stuff in an afternoon, + # so I probably did this to prevent the more than 3 levels of indentation + # and to make the reasoning easier to track as I was adding the feature. 
+ # + # Collaborator + # Author + # @peterrrock2 peterrrock2 last week + # Also, I imagine that I originally wanted the function modification to look like + # + # def random_spanning_tree( + # graph: Graph, + # region_surcharge: dict = dict() + # ) -> Graph: + # + # but doing this sort of thing is generally a bad idea in python since the + # dict() is instantiated at import time and then all future calls to the + # function reference the same dict when the surcharge is unset. Not a problem + # for this function, but the accepted best-practice is to change the above to + # + # def random_spanning_tree( + # graph: Graph, + # region_surcharge: Optional[Dict] = None + # ) -> Graph: + # if region_surcharge is None: + # region_surcharge = dict() + # + # since this doesn't reuse the reference. + + + + + if region_surcharge is None: region_surcharge = dict() - for edge in graph.edges(): + # frm: Original Code: for edge in graph.edges(): + # Changed because in RX edge_ids are integers while edges are tuples + + for edge_id in graph.edge_indices: + edge = graph.get_edge_from_edge_id(edge_id) weight = random.random() for key, value in region_surcharge.items(): # We surcharge edges that cross regions and those that are not in any region if ( - graph.nodes[edge[0]][key] != graph.nodes[edge[1]][key] - or graph.nodes[edge[0]][key] is None - or graph.nodes[edge[1]][key] is None + # frm: original code: graph.nodes[edge[0]][key] != graph.nodes[edge[1]][key] + # frm: original code: or graph.nodes[edge[0]][key] is None + # frm: original code: or graph.nodes[edge[1]][key] is None + graph.node_data(edge[0])[key] != graph.node_data(edge[1])[key] + or graph.node_data(edge[0])[key] is None + or graph.node_data(edge[1])[key] is None ): weight += value - graph.edges[edge]["random_weight"] = weight + # frm: Original Code: graph.edges[edge]["random_weight"] = weight + graph.edge_data(edge_id)["random_weight"] = weight - spanning_tree = tree.minimum_spanning_tree( - graph, algorithm="kruskal", weight="random_weight" - ) - return spanning_tree + # frm: CROCK: (for the moment) + # We need to create a minimum spanning tree but the way to do so + # is different for NX and RX. I am sure that there is a more elegant + # way to do this, and in any event, this dependence on NX vs RX + # should not be in this file, tree.py, but for now, I am just trying + # to get this to work, so I am using CROCKS... + + graph.verify_graph_is_valid() + + # frm: TODO: Remove NX / RX dependency - maybe move to graph.py + + if (graph.is_nx_graph()): + nx_graph = graph.get_nx_graph() + spanning_tree = nxtree.minimum_spanning_tree( + nx_graph, algorithm="kruskal", weight="random_weight" + ) + spanningGraph = Graph.from_networkx(spanning_tree) + elif (graph.is_rx_graph()): + rx_graph = graph.get_rx_graph() + def get_weight(edge_data): + # function to get the weight of an edge from its data + # This function is passed a dict with the data for the edge. + return edge_data["random_weight"] + spanning_tree = rx.minimum_spanning_tree(rx_graph, get_weight) + spanningGraph = Graph.from_rustworkx(spanning_tree) + else: + raise Exception("random_spanning_tree - bad kind of graph object") + return spanningGraph def uniform_spanning_tree( - graph: nx.Graph, choice: Callable = random.choice -) -> nx.Graph: + # frm: Original code: graph: nx.Graph, choice: Callable = random.choice + graph: Graph, + choice: Callable = random.choice +) -> Graph: """ Builds a spanning tree chosen uniformly from the space of all spanning trees of the graph. Uses Wilson's algorithm. 
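+# frm: For reference, Wilson's algorithm in miniature - a standalone sketch on a
+#      plain adjacency dict (not GerryChain's Graph API), mirroring the loop below:

import random

def wilson_spanning_tree_edges(adj, choice=random.choice):
    """Wilson's algorithm on an adjacency dict {node: [neighbors]};
    returns a set of tree edges (child, parent)."""
    nodes = list(adj)
    root = choice(nodes)
    in_tree = {root}
    parent = {root: None}
    for start in nodes:
        # Random walk from `start`, remembering only the LAST exit taken
        # from each node (this is the loop erasure).
        u = start
        while u not in in_tree:
            parent[u] = choice(adj[u])
            u = parent[u]
        # Retrace the (now loop-free) walk and attach it to the tree.
        u = start
        while u not in in_tree:
            in_tree.add(u)
            u = parent[u]
    return {(u, p) for u, p in parent.items() if p is not None}

# e.g. a 4-cycle:
print(wilson_spanning_tree_edges({0: [1, 3], 1: [0, 2], 2: [1, 3], 3: [2, 0]}))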
@@ -109,29 +265,63 @@ def uniform_spanning_tree( :returns: A spanning tree of the graph chosen uniformly at random. :rtype: nx.Graph """ - root = choice(list(graph.node_indices)) - tree_nodes = set([root]) - next_node = {root: None} - - for node in graph.node_indices: - u = node + + """ + frm: RX Docmentation: + + As with random_spanning_tree, I am assuming that the issue of RX subgraphs having + different node_ids is not an issue for this routine... + """ + # Pick a starting point at random + root_id = choice(list(graph.node_indices)) + tree_nodes = set([root_id]) + next_node_id = {root_id: None} + + # frm: I think that this builds a tree bottom up. It takes + # every node in the graph (in sequence). If the node + # is already in the list of nodes that have been seen + # which means it has a neighbor registered as a next_node, + # then it is skipped. If this node does not yet have + # a neighbor registered, then it is given one, and + # that neighbor becomes the next node looked at. + # + # This essentially takes a node and travels "up" until + # it finds a node that is already in the tree. Multiple + # nodes can end up with the same "next_node" - which + # in tree-speak means that next_node is the parent of + # all of the nodes that end on it. + + for node_id in graph.node_indices: + u = node_id while u not in tree_nodes: - next_node[u] = choice(list(graph.neighbors(u))) - u = next_node[u] + next_node_id[u] = choice(list(graph.neighbors(u))) + u = next_node_id[u] - u = node + u = node_id while u not in tree_nodes: tree_nodes.add(u) - u = next_node[u] + u = next_node_id[u] + + # frm DONE: To support RX, I added an add_edge() method to Graph. + + # frm: TODO: Remove dependency on NX below - G = nx.Graph() - for node in tree_nodes: - if next_node[node] is not None: - G.add_edge(node, next_node[node]) + # frm: Original code: G = nx.Graph() + nx_graph = nx.Graph() + G = Graph.from_networkx(nx_graph) + + for node_id in tree_nodes: + if next_node_id[node_id] is not None: + G.add_edge(node_id, next_node_id[node_id]) return G +# frm TODO +# +# I think that this is only ever used inside this module (except) +# for testing. +# class PopulatedGraph: """ A class representing a graph with population information. @@ -153,7 +343,7 @@ class PopulatedGraph: def __init__( self, - graph: nx.Graph, + graph: Graph, # frm: Original code: graph: nx.Graph, populations: Dict, ideal_pop: Union[float, int], epsilon: float, @@ -177,17 +367,37 @@ def __init__( self.epsilon = epsilon self._degrees = {node: graph.degree(node) for node in graph.nodes} + # frm TODO: Verify that this does the right thing for the new Graph object def __iter__(self): return iter(self.graph) def degree(self, node) -> int: return self._degrees[node] + # frm: only ever used inside this file + # But maybe this is intended to be used externally... + # + # In PR: Peter said: + # + # We do use this external to the class when calling find_balance_edge_cuts_contraction + + + # frm: ???: What the fat does this do? Start with what a population is. It + # appears to be indexed by node. Also, what is a subset? GRRRR... def contract_node(self, node, parent) -> None: + # frm: ???: TODO: This routine is only used once, so why have a separate + # routine - why not just include this code inline where + # the function is now called? It would be simpler to read + # inline than having to go find this definition. + # + # Perhaps it is of use externally, but that seems doubtful... 
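+    # frm: Concretely: as leaves are contracted, `population[x]` grows into the
+    #      total population of the subtree hanging below x, and `subsets[x]` grows
+    #      into the set of graph nodes in that subtree. Contracting a leaf folds
+    #      its totals into its parent, e.g.:
+    #
+    #          population = {p: 10, leaf: 4};  subsets = {p: {p}, leaf: {leaf}}
+    #          contract_node(leaf, p)  ==>  population[p] == 14,
+    #                                       subsets[p] == {p, leaf},
+    #                                       degree(p) reduced by 1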
+ self.population[parent] += self.population[node] self.subsets[parent] |= self.subsets[node] self._degrees[parent] -= 1 + # frm: only ever used inside this file + # But maybe this is intended to be used externally... def has_ideal_population(self, node, one_sided_cut: bool = False) -> bool: """ Checks if a node has an ideal population within the graph up to epsilon. @@ -204,6 +414,21 @@ def has_ideal_population(self, node, one_sided_cut: bool = False) -> bool: :returns: True if the node has an ideal population within the graph up to epsilon. :rtype: bool """ + + # frm: ???: TODO: this logic is repeated several times in this file. Consider + # refactoring the code so that the logic lives in exactly + # one place. + # + # When thinking about refactoring, consider whether it makes + # sense to toggle what this routine does by the "one_sided_cut" + # parameter. Why not have two separate routines with + # similar but distinguishing names. I need to be absolutely + # clear about what the two cases are all about, but my current + # hypothesis is that when one_sided_cut == False, we are looking + # for the edge which when cut produces two districts of + # approximately equal size - so a bisect rather than a find all + # meaning... + if one_sided_cut: return ( abs(self.population[node] - self.ideal_pop) @@ -229,6 +454,9 @@ def __repr__(self) -> str: ) +# frm: ???: Is a Cut used anywhere outside this file? + +# Definition of Cut namedtuple # Tuple that is used in the find_balanced_edge_cuts function Cut = namedtuple("Cut", "edge weight subset") Cut.__new__.__defaults__ = (None, None, None) @@ -239,6 +467,24 @@ def __repr__(self) -> str: "The (frozen) subset of nodes on one side of the cut. Defaults to None." ) +# frm: TODO: Not sure how this is used, and so I do not know whether it needs +# to translate node_ids to the parent_node_id context. I am assuming not... +# +# Here is an example of how it is used (in test_tree.py): +# +# method=partial( +# bipartition_tree, +# max_attempts=10000, +# balance_edge_fn=find_balanced_edge_cuts_contraction, +# +# and another in the same test file: +# +# populated_tree = PopulatedGraph( +# tree, {node: 1 for node in tree}, len(tree) / 2, 0.5 +# ) +# cuts = find_balanced_edge_cuts_contraction(populated_tree) + + def find_balanced_edge_cuts_contraction( h: PopulatedGraph, one_sided_cut: bool = False, choice: Callable = random.choice @@ -263,25 +509,55 @@ def find_balanced_edge_cuts_contraction( root = choice([x for x in h if h.degree(x) > 1]) # BFS predecessors for iteratively contracting leaves - pred = predecessors(h.graph, root) + # frm: Original code: pred = predecessors(h.graph, root) + pred = h.graph.predecessors(root) cuts = [] + + # frm: Work up from leaf nodes to find subtrees with the "correct" + # population. The algorighm starts with real leaf nodes, but + # if a node does not have the "correct" population, then that + # node is merged (contracted) into its parent, effectively + # creating another leaf node which is then added to the end + # of the queue. + # + # In this way, we calculate the total population of subtrees + # by going bottom up, until we find a subtree that has the + # "correct" population for a cut. + + # frm: ??? Note that there is at least one other routine in this file + # that does something similar (perhaps exactly the same). + # Need to figure out why there are more than one way to do this... 
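+    # frm: Tiny worked example of the contraction search, assuming a path graph
+    #      A - B - C - D with population 1 per node, ideal_pop 2, epsilon 0,
+    #      and root C:
+    #
+    #          leaves = [A, D]
+    #          pop(A) = 1 -> not ideal; contract A into B  (pop(B) = 2, B becomes a leaf)
+    #          pop(D) = 1 -> not ideal; contract D into C  (C is the root, not enqueued)
+    #          pop(B) = 2 -> ideal!  record Cut(edge=(B, C), subset={A, B})
+    #
+    #      i.e. cutting edge (B, C) splits the tree into {A, B} and {C, D}, both
+    #      with the ideal population.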
+ leaves = deque(x for x in h if h.degree(x) == 1) while len(leaves) > 0: leaf = leaves.popleft() if h.has_ideal_population(leaf, one_sided_cut=one_sided_cut): + # frm: If the population of the subtree rooted in this node is the correct + # size, then add it to the cut list. Note that if one_sided_cut == False, + # then the cut means the cut bisects the partition (frm: ??? need to verify this). e = (leaf, pred[leaf]) cuts.append( Cut( edge=e, - weight=h.graph.edges[e].get("random_weight", random.random()), + # frm: Original Code: weight=h.graph.edges[e].get("random_weight", random.random()), + # frm: TODO: edges vs. edge_ids: edge_ids are wanted here (integers) + weight=h.graph.edge_data( + h.graph.get_edge_id_from_edge(e) + ).get("random_weight", random.random()), subset=frozenset(h.subsets[leaf].copy()), ) ) - # Contract the leaf: + # Contract the leaf: frm: merge the leaf's population into the parent and add the parent to "leaves" parent = pred[leaf] + # frm: Add child population and subsets to parent, reduce parent's degree by 1 + # This effectively removes the leaf from the tree, adding all of its data + # to the parent. h.contract_node(leaf, parent) if h.degree(parent) == 1 and parent != root: + # frm: Only add the parent to the end of the queue when we are merging + # the last leaf - this makes sure we only add the parent node to + # the queue one time... leaves.append(parent) return cuts @@ -301,6 +577,18 @@ def _calc_pops(succ, root, h): :returns: A dictionary mapping nodes to their subtree populations. :rtype: Dict """ + # frm: This took me a while to sort out what was going on. + # Conceptually it is easy - given a tree anchored in a root node, + # calculate the population in each subtree going bottom-up. + # The stack (deque) provides the mechanism for going bottom-up. + # On the way down, you just put nodes in the stack (append is like + # push() which seems odd to me, but whatever...) then on the way back + # up, you add the totals for each child to your own population and + # presto you have the total population for each subtree... + # + # For this to work, you just need to have a list of nodes with + # their successors associated with them... + # subtree_pops: Dict[Any, Union[int, float]] = {} stack = deque(n for n in succ[root]) while stack: @@ -322,6 +610,7 @@ def _calc_pops(succ, root, h): return subtree_pops +# frm: Only used in one function and only in this module... def _part_nodes(start, succ): """ Partitions the nodes of a graph into two sets. @@ -335,6 +624,39 @@ def _part_nodes(start, succ): :returns: A set of nodes for a particular district (only one side of the cut). :rtype: Set """ + + """ + frm: Compute the nodes in a subtree defined by a Cut. + + This routine computes the set of nodes in a subtree rooted in the + node identified by "start" in the tree defined by "succ". + + As such it is highly dependent on context and is not generally + useful. That is, it is essentially just a way to refactor some + code used in a couple of places so that the logic in the code is + in one place instead of several. + + To be specific, Cuts are always relative to a specific tree for + a partition. This tree is a "spanning tree" that converts the + graph into a DAG. Cuts are then computed by finding subtrees + of that DAG that have the appropriate population (this could + presumably be modified to include other factors). + + When a Cut is created, we want to collect all of the nodes that + are in the subtree, and this is what this routine does. 
It + merely starts at the root of the subtree (start) and goes down + the subtree, adding each node to a set. + + frm: ???: TODO: Rename this to be more descriptive - perhaps ] + something like: _nodes_in_subtree() or + _nodes_for_cut() + + frm: TODO: Add the above explanation for what a Cut is and how + we find them by converting the graph to a DAG and + then looking for subtrees to a block header at the + top of this file. It will give the reader some + idea wtf is going on... ;-) + """ nodes = set() queue = deque([start]) while queue: @@ -347,7 +669,7 @@ def _part_nodes(start, succ): queue.append(c) return nodes - +#frm: used externally by tree_proposals.py def find_balanced_edge_cuts_memoization( h: PopulatedGraph, one_sided_cut: bool = False, choice: Callable = random.choice ) -> List[Cut]: @@ -373,12 +695,40 @@ def find_balanced_edge_cuts_memoization( :returns: A list of balanced edge cuts. :rtype: List[Cut] """ + + """ + frm: ???: confused... + + This function seems to be used for two very different purposes, depending on the + value of the parameter, one_sided_cut. When true, the code looks for lots of cuts + that would create a district with the right population - both above and below the + node being considered. Given that it is operating on a tree, one would assume that + there is only one (or perhaps two if one node's population was tiny) cut for the top + of the tree, but there should be many for the bottom of the tree. + + However, if the paramter is set to false (the default), then the code checks to see + whether a cut would produce two districts - on above and one below the tree that + have the right populations. In this case, the code is presumatly looking for the + single node (again there might be two if one node's population was way below epsilon) + that would bisect the graph into two districts with a tolerable population. + + If I am correct, then there is an opportunity to clarify these two uses - perhaps + with wrapper functions. I am also a bit surprised that snippets of code are repeated. + Again - this causes mental load for the reader, and it is an opportunity for bugs to + creep in later (you fix it in one place but not the other). Not sure this "clarification" + is desired, but it is worth considering... + """ + + # frm: ???: Why does a root have to have degree > 1? I would think that any node would do... root = choice([x for x in h if h.degree(x) > 1]) - pred = predecessors(h.graph, root) - succ = successors(h.graph, root) + # frm: Original code: pred = predecessors(h.graph, root) + # frm: Original code: succ = successors(h.graph, root) + pred = h.graph.predecessors(root) + succ = h.graph.successors(root) total_pop = h.tot_pop + # Calculate the population of each subtree in the "succ" tree subtree_pops = _calc_pops(succ, root, h) cuts = [] @@ -386,29 +736,56 @@ def find_balanced_edge_cuts_memoization( if one_sided_cut: for node, tree_pop in subtree_pops.items(): if abs(tree_pop - h.ideal_pop) <= h.ideal_pop * h.epsilon: - e = (node, pred[node]) + # frm: If the subtree for this node has a population within epsilon + # of the ideal, then add it to the cuts list. + e = (node, pred[node]) # get the edge from the parent to this node wt = random.random() + # frm: Add the cut - set its weight if it does not already have one + # and remember all of the nodes in the subtree in the frozenset cuts.append( Cut( edge=e, - weight=h.graph.edges[e].get("random_weight", wt), + # frm: Original Code: weight=h.graph.edges[e].get("random_weight", wt), + # frm: edges vs. 
edge_ids: edge_ids are wanted here (integers) + weight=h.graph.edge_data( + h.graph.get_edge_id_from_edge(e) + ).get("random_weight", wt), subset=frozenset(_part_nodes(node, succ)), ) ) elif abs((total_pop - tree_pop) - h.ideal_pop) <= h.ideal_pop * h.epsilon: + # frm: If the population of everything ABOVE this node in the tree is + # within epsilon of the ideal, then add it to the cut list too. e = (node, pred[node]) wt = random.random() cuts.append( Cut( edge=e, - weight=h.graph.edges[e].get("random_weight", wt), + # frm: Original Code: weight=h.graph.edges[e].get("random_weight", wt), + # frm: edges vs. edge_ids: edge_ids are wanted here (integers) + weight=h.graph.edge_data( + h.graph.get_edge_id_from_edge(e) + ).get("random_weight", wt), subset=frozenset(set(h.graph.nodes) - _part_nodes(node, succ)), ) ) return cuts + # frm: TODO: Refactor this code to make its two use cases clearer: + # + # One use case is bisecting the graph (one_sided_cut is False). The + # other use case is to peel off one part (district) with the appropriate + # population. + # + # Not quite clear yet exactly how to do this, but a return stmt in the middle + # of the routine (above) is a clear sign that something is odd. Perhaps + # we keep the existing function signature but immediately split the code + # into calls on two separate routines - one for each use case. + + # We are looking for a way to bisect the graph (one_sided_cut is False) for node, tree_pop in subtree_pops.items(): + if (abs(tree_pop - h.ideal_pop) <= h.ideal_pop * h.epsilon) and ( abs((total_pop - tree_pop) - h.ideal_pop) <= h.ideal_pop * h.epsilon ): @@ -417,13 +794,17 @@ def find_balanced_edge_cuts_memoization( cuts.append( Cut( edge=e, - weight=h.graph.edges[e].get("random_weight", wt), + # frm: Original Code: weight=h.graph.edges[e].get("random_weight", wt), + # frm: edges vs. edge_ids: edge_ids are wanted here (integers) + weight=h.graph.edge_data( + h.graph.get_edge_id_from_edge(e) + ).get("random_weight", wt), subset=frozenset(set(h.graph.nodes) - _part_nodes(node, succ)), ) ) return cuts - +# frm: only used in this file and in a test class BipartitionWarning(UserWarning): """ Generally raised when it is proving difficult to find a balanced cut. @@ -431,7 +812,7 @@ class BipartitionWarning(UserWarning): pass - +# frm: only used in this file and in a test class ReselectException(Exception): """ Raised when the tree-splitting algorithm is unable to find a @@ -477,9 +858,20 @@ def _max_weight_choice(cut_edge_list: List[Cut]) -> Cut: if not isinstance(cut_edge_list[0], Cut) or cut_edge_list[0].weight is None: return random.choice(cut_edge_list) + # frm: ???: this strikes me as possibly expensive. Computing the + # max in a list is O(N) so not terrible, but this + # might be called lots of times (need to know more about + # how it is used). Would it make sense to have the + # cut_edge_list sorted before it is frozen? I think it + # is now a set, so it would need to be a list... Not + # urgent, but worth looking into at some point... + # return max(cut_edge_list, key=lambda cut: cut.weight) +# frm: ???: Only ever used once... +# frm: ???: TODO: Figure out what this does. There is no NX/RX issue here, I just +# don't yet know what it does or why... 
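+# frm: To answer the "what does this do" question below, here is a hypothetical
+#      reconstruction (the sort itself is elided by this hunk, so the exact key
+#      is a guess; the name is made up for illustration):

import itertools

def power_set_by_size_then_sum(d):
    subsets = [s for i in range(1, len(d) + 1)
               for s in itertools.combinations(d.keys(), i)]
    # biggest subsets first; ties broken by total surcharge value
    return sorted(subsets, key=lambda s: (len(s), sum(d[k] for k in s)), reverse=True)

print(power_set_by_size_then_sum({"muni": 0.2, "water": 0.8}))
# [('muni', 'water'), ('water',), ('muni',)]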
def _power_set_sorted_by_size_then_sum(d):
    power_set = [
        s for i in range(1, len(d) + 1) for s in itertools.combinations(d.keys(), i)
    ]
@@ -501,6 +893,8 @@
def _region_preferred_max_weight_choice(
    populated_graph: PopulatedGraph, region_surcharge: Dict, cut_edge_list: List[Cut]
) -> Cut:
+    # frm: ???: There is no NX/RX dependency in this routine, but I do
+    #      not yet understand what it does or why...
    """
    This function is used in the case of a region-aware chain. It is
    similar to the :meth:`_max_weight_choice` function except
@@ -551,9 +945,26 @@
    # Prepare data for efficient access
    edge_region_info = {
        cut: {
+            # frm: This code is a bit dense (at least for me).
+            #      Given a cut_edge_list (whose elements have an
+            #      attribute, "edge"), construct a dict that
+            #      associates with each "cut" the values of the
+            #      region_surcharge keys for both nodes in the edge.
+            #
+            #      So, if the region_surcharge dict was
+            #      {"muni": 0.2, "water": 0.8}, then for each cut,
+            #      cut_n, there would be a dict value that looked like:
+            #
+            #          {"muni": ("siteA", "siteA"),
+            #           "water": ("water1", "water2")}
+            #
            key: (
-                populated_graph.graph.nodes[cut.edge[0]].get(key),
-                populated_graph.graph.nodes[cut.edge[1]].get(key),
+                # frm: original code: populated_graph.graph.nodes[cut.edge[0]].get(key),
+                # frm: original code: populated_graph.graph.nodes[cut.edge[1]].get(key),
+                populated_graph.graph.node_data(cut.edge[0]).get(key),
+                populated_graph.graph.node_data(cut.edge[1]).get(key),
            )
            for key in region_surcharge
        }
@@ -577,14 +988,43 @@
    return _max_weight_choice(cut_edge_list)


+# frm TODO: RX version NYI... def bipartition_tree(
+#
+# This might get complicated depending on what kinds of functions
+# are used as parameters. That is, do the functions used as parameters
+# assume they are working with an NX graph?
+#
+# I think all of the functions used as parameters have been converted
+# to work on the new Graph object, but perhaps end users have created
+# their own? Should probably add logic to verify that the
+# functions are not written to be operating on an NX Graph. Not sure
+# how to do that though...
+#
+# Peter's comments from PR:
+#
+#     Users do sometimes write custom spanning tree and cut edge functions. My
+#     recommendation would be to make this simple for now. Have a list of "RX_compatible"
+#     functions and then have the MarkovChain class do some coercion to store an
+#     appropriate graph and partition object at initialization. We always expect
+#     the workflow to be something like
+#
+#         Graph -> Partition -> MarkovChain
+#
+#     But we do copy operations in each step, so I wouldn't expect any weird
+#     side-effects from pushing the determination of what graph type to use
+#     off onto the MarkovChain class

+# frm: used in this file and in tree_proposals.py
+#      But maybe this is intended to be used externally...
+# def bipartition_tree( - graph: nx.Graph, + subgraph_to_split: Graph, # frm: Original code: graph: nx.Graph, pop_col: str, pop_target: Union[int, float], epsilon: float, node_repeats: int = 1, - spanning_tree: Optional[nx.Graph] = None, + spanning_tree: Optional[Graph] = None, # frm: Original code: spanning_tree: Optional[nx.Graph] = None, spanning_tree_fn: Callable = random_spanning_tree, region_surcharge: Optional[Dict] = None, balance_edge_fn: Callable = find_balanced_edge_cuts_memoization, @@ -595,6 +1035,9 @@ def bipartition_tree( allow_pair_reselection: bool = False, cut_choice: Callable = _region_preferred_max_weight_choice, ) -> Set: + # frm: TODO: Change the names of ALL function formal parameters to end in "_fn" - to make it clear + # that the paraemter is a function. This will make it easier to do a global search + # to find all function parameters - as well as just being good coding practice... """ This function finds a balanced 2 partition of a graph by drawing a spanning tree and finding an edge to cut that leaves at most an epsilon @@ -661,40 +1104,60 @@ def bipartition_tree( given by ``max_attempts``. """ # Try to add the region-aware in if the spanning_tree_fn accepts a surcharge dictionary + # frm ???: REALLY??? You are going to change the semantics of your program based on the + # a function argument's signature? What if someone refactors the code to have + # different names??? *sigh* + # if "region_surcharge" in signature(spanning_tree_fn).parameters: spanning_tree_fn = partial(spanning_tree_fn, region_surcharge=region_surcharge) if "one_sided_cut" in signature(balance_edge_fn).parameters: balance_edge_fn = partial(balance_edge_fn, one_sided_cut=one_sided_cut) - populations = {node: graph.nodes[node][pop_col] for node in graph.node_indices} + # frm: original code: populations = {node: graph.nodes[node][pop_col] for node in graph.node_indices} + populations = {node_id: subgraph_to_split.node_data(node_id)[pop_col] for node_id in subgraph_to_split.node_indices} possible_cuts: List[Cut] = [] if spanning_tree is None: - spanning_tree = spanning_tree_fn(graph) + # frm TODO: Make sure spanning_tree_fn operates on new Graph object + spanning_tree = spanning_tree_fn(subgraph_to_split) restarts = 0 attempts = 0 while max_attempts is None or attempts < max_attempts: if restarts == node_repeats: - spanning_tree = spanning_tree_fn(graph) + # frm TODO: Make sure spanning_tree_fn operates on new Graph object + # frm: ???: Not sure what this if-stmt is for... + spanning_tree = spanning_tree_fn(subgraph_to_split) restarts = 0 h = PopulatedGraph(spanning_tree, populations, pop_target, epsilon) + # frm: ???: TODO: Again - we should NOT be changing semantics based + # on the names in signatures... is_region_cut = ( "region_surcharge" in signature(cut_choice).parameters and "populated_graph" in signature(cut_choice).parameters ) + # frm: Find one or more edges in the spanning tree, that if cut would + # result in a subtree with the appropriate population. 
+ # This returns a list of Cut objects with attributes edge and subset possible_cuts = balance_edge_fn(h, choice=choice) + # frm: RX Subgraph if len(possible_cuts) != 0: + chosen_cut = None if is_region_cut: - return cut_choice(h, region_surcharge, possible_cuts).subset - - return cut_choice(possible_cuts).subset + chosen_cut = cut_choice(h, region_surcharge, possible_cuts) + else: + chosen_cut = cut_choice(possible_cuts) + translated_nodes = subgraph_to_split.translate_subgraph_node_ids_for_set_of_nodes( + chosen_cut.subset + ) + # frm: Not sure if it is important that the returned set be a frozenset... + return frozenset(translated_nodes) restarts += 1 attempts += 1 @@ -708,7 +1171,7 @@ def bipartition_tree( "a different pair of districts for recombination.", BipartitionWarning, ) - + if allow_pair_reselection: raise ReselectException( f"Failed to find a balanced cut after {max_attempts} attempts.\n" @@ -718,19 +1181,42 @@ def bipartition_tree( raise RuntimeError(f"Could not find a possible cut after {max_attempts} attempts.") +# frm TODO: Note: Re: _bipartition_tree_random_all() +# +# There were a couple of interesting issues surrounding this routine in the original code +# related to subgraphs. The question was whether or not to translate HERE + +# frm: WTF: TODO: This function has a leading underscore indicating that it is a private +# function, but in fact it is used in tree_proposals.py... It also returns +# Cuts which I had hoped would be an internal data structure, but... +# frm: RX-TODO This is called in tree_proposals.py with a subgraph, so it needs to +# return translated Cut objects. However, it is also called internally in +# this code. I need to make sure that I do not translate the node_ids to the +# parent_node_ids twice. At present, they are converted in this file by the +# caller, but that won't work in tree_proposals.py, because there it is called +# with a subgraph, so it would be too late to try to do it in the caller. +# +# Two options: 1) Have this routine do the translation and then comment the +# crap out of the call in this file to make sure we do NOT translate them again, or +# 2) figure out a way to get this OUT of tree_proposals.py where it seems it should +# not be in the first place... +# def _bipartition_tree_random_all( - graph: nx.Graph, + # frm: Note: Changed the name from "graph" to "subgraph_to_split" to remind any future readers + # of the code that the graph passed in is not the partition's graph, and + # that any node_ids passed back should be translated into parent_node_ids. + subgraph_to_split: Graph, # frm: Original code: graph: nx.Graph, pop_col: str, pop_target: Union[int, float], epsilon: float, node_repeats: int = 1, repeat_until_valid: bool = True, - spanning_tree: Optional[nx.Graph] = None, + spanning_tree: Optional[Graph] = None, # frm: Original code: spanning_tree: Optional[nx.Graph] = None, spanning_tree_fn: Callable = random_spanning_tree, balance_edge_fn: Callable = find_balanced_edge_cuts_memoization, choice: Callable = random.choice, max_attempts: Optional[int] = 100000, -) -> List[Tuple[Hashable, Hashable]]: +) -> List[Tuple[Hashable, Hashable]]: # frm: TODO: Change this to be a set of node_ids (ints) """ Randomly bipartitions a tree into two subgraphs until a valid bipartition is found. @@ -770,23 +1256,31 @@ def _bipartition_tree_random_all( attempts. 
""" - populations = {node: graph.nodes[node][pop_col] for node in graph.node_indices} + # frm: original code: populations = {node: graph.nodes[node][pop_col] for node in graph.node_indices} + populations = { + node_id: subgraph_to_split.node_data(node_id)[pop_col] + for node_id in subgraph_to_split.node_indices + } possible_cuts = [] if spanning_tree is None: - spanning_tree = spanning_tree_fn(graph) + # frm TODO: Make sure spanning_tree_fn works on new Graph object + spanning_tree = spanning_tree_fn(subgraph_to_split) restarts = 0 attempts = 0 while max_attempts is None or attempts < max_attempts: if restarts == node_repeats: - spanning_tree = spanning_tree_fn(graph) + # frm TODO: Make sure spanning_tree_fn works on new Graph object + spanning_tree = spanning_tree_fn(subgraph_to_split) restarts = 0 h = PopulatedGraph(spanning_tree, populations, pop_target, epsilon) possible_cuts = balance_edge_fn(h, choice=choice) + # frm: RX-TODO: Translate cuts into node_id context of the parent. if not (repeat_until_valid and len(possible_cuts) == 0): + # frm: TODO: Remove deubgging code: return possible_cuts restarts += 1 @@ -794,15 +1288,129 @@ def _bipartition_tree_random_all( raise RuntimeError(f"Could not find a possible cut after {max_attempts} attempts.") +# frm: used in this file and in tree_proposals.py +# But maybe this is intended to be used externally... + +####################### +# frm: Note: This routine is EXACTLY the same as bipartition_tree_random() except +# that it returns in addition to the nodes for a new district, the +# number of possible new districts. This additional information +# is needed by reversible_recom(), but I did not want to change the +# function signature of bipartition_tree_random() in case it is used +# as part of the public API by someone. +# +# It is bad form to have two functions that are the same excpet for +# a tweak - an invitation for future bugs when you fix something in +# one place and not the other, so maybe this is something we should +# revisit when we decide a general code cleanup is in order... +# +def bipartition_tree_random_with_num_cuts( + graph: Graph, # frm: Original code: graph: nx.Graph, + pop_col: str, + pop_target: Union[int, float], + epsilon: float, + node_repeats: int = 1, + repeat_until_valid: bool = True, + spanning_tree: Optional[Graph] = None, # frm: Original code: spanning_tree: Optional[nx.Graph] = None, + spanning_tree_fn: Callable = random_spanning_tree, + balance_edge_fn: Callable = find_balanced_edge_cuts_memoization, + one_sided_cut: bool = False, + choice: Callable = random.choice, + max_attempts: Optional[int] = 100000, +) -> Union[Set[Any], None]: + """ + This is like :func:`bipartition_tree` except it chooses a random balanced + cut, rather than the first cut it finds. + + This function finds a balanced 2 partition of a graph by drawing a + spanning tree and finding an edge to cut that leaves at most an epsilon + imbalance between the populations of the parts. If a root fails, new roots + are tried until node_repeats in which case a new tree is drawn. + + Builds up a connected subgraph with a connected complement whose population + is ``epsilon * pop_target`` away from ``pop_target``. + + :param graph: The graph to partition. + :type graph: nx.Graph + :param pop_col: The node attribute holding the population of each node. + :type pop_col: str + :param pop_target: The target population for the returned subset of nodes. 
+ :type pop_target: Union[int, float] + :param epsilon: The allowable deviation from ``pop_target`` (as a percentage of + ``pop_target``) for the subgraph's population. + :type epsilon: float + :param node_repeats: A parameter for the algorithm: how many different choices + of root to use before drawing a new spanning tree. Defaults to 1. + :type node_repeats: int + :param repeat_until_valid: Determines whether to keep drawing spanning trees + until a tree with a balanced cut is found. If `True`, a set of nodes will + always be returned; if `False`, `None` will be returned if a valid spanning + tree is not found on the first try. Defaults to True. + :type repeat_until_valid: bool, optional + :param spanning_tree: The spanning tree for the algorithm to use (used when the + algorithm chooses a new root and for testing). Defaults to None. + :type spanning_tree: Optional[nx.Graph], optional + :param spanning_tree_fn: The random spanning tree algorithm to use if a spanning + tree is not provided. Defaults to :func:`random_spanning_tree`. + :type spanning_tree_fn: Callable, optional + :param balance_edge_fn: The algorithm used to find balanced cut edges. Defaults to + :func:`find_balanced_edge_cuts_memoization`. + :type balance_edge_fn: Callable, optional + :param one_sided_cut: Passed to the ``balance_edge_fn``. Determines whether or not we are + cutting off a single district when partitioning the tree. When + set to False, we check if the node we are cutting and the remaining graph + are both within epsilon of the ideal population. When set to True, we only + check if the node we are cutting is within epsilon of the ideal population. + Defaults to False. + :type one_sided_cut: bool, optional + :param choice: The random choice function. Can be substituted for testing. Defaults + to :func:`random.choice`. + :type choice: Callable, optional + :param max_attempts: The max number of attempts that should be made to bipartition. + Defaults to None. + :type max_attempts: Optional[int], optional + + :returns: A subset of nodes of ``graph`` (whose induced subgraph is connected) or None if a + valid spanning tree is not found. + :rtype: Union[Set[Any], None] + """ + + # frm: ???: TODO: Again - semantics should not depend on signatures... + if "one_sided_cut" in signature(balance_edge_fn).parameters: + balance_edge_fn = partial(balance_edge_fn, one_sided_cut=True) + possible_cuts = _bipartition_tree_random_all( + subgraph_to_split=graph, + pop_col=pop_col, + pop_target=pop_target, + epsilon=epsilon, + node_repeats=node_repeats, + repeat_until_valid=repeat_until_valid, + spanning_tree=spanning_tree, + spanning_tree_fn=spanning_tree_fn, + balance_edge_fn=balance_edge_fn, + choice=choice, + max_attempts=max_attempts, + ) + if possible_cuts: + chosen_cut = choice(possible_cuts) + num_cuts = len(possible_cuts) + parent_nodes = graph.translate_subgraph_node_ids_for_set_of_nodes(chosen_cut.subset) + return num_cuts, frozenset(parent_nodes) # frm: Not sure if important that it be frozenset + else: + # frm: TODO: Grok when this returns None and why and what the calling code does and why... + return None + +####################### +# frm TODO: RX version NYI... 
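+# frm: A minimal demonstration of the "semantics should not depend on
+#      signatures" objection raised in the ??? comments around here. This is a
+#      standalone sketch; balance_fn is a stand-in, not the real API:

from inspect import signature

def balance_fn(h, one_sided_cut=False):
    return "one-sided" if one_sided_cut else "two-sided"

print("one_sided_cut" in signature(balance_fn).parameters)   # True

# An innocent wrapper changes the visible signature, and the feature
# detection above silently flips off:
def wrapped(h):
    return balance_fn(h, one_sided_cut=True)

print("one_sided_cut" in signature(wrapped).parameters)      # False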
def bipartition_tree_random( - graph: nx.Graph, + subgraph_to_split: Graph, # frm: Original code: graph: nx.Graph, pop_col: str, pop_target: Union[int, float], epsilon: float, node_repeats: int = 1, repeat_until_valid: bool = True, - spanning_tree: Optional[nx.Graph] = None, + spanning_tree: Optional[Graph] = None, # frm: Original code: spanning_tree: Optional[nx.Graph] = None, spanning_tree_fn: Callable = random_spanning_tree, balance_edge_fn: Callable = find_balanced_edge_cuts_memoization, one_sided_cut: bool = False, @@ -865,11 +1473,25 @@ def bipartition_tree_random( valid spanning tree is not found. :rtype: Union[Set[Any], None] """ + + # frm: ???: TODO: Again - semantics should not depend on signatures... + # + # This is odd - there are two balance_edge_functions defined in tree.py but + # both of them have a formal parameter with the name "one_sided_cut", so this + # code is not picking one of them. Perhaps there was an earlier version of + # the code where it allowed functions that did not support "one_sided_cut". + # In any event, it looks like this if-stmt is a no-op as far as the current + # codebase is concerned... + # + # Even odder - there is a formal parameter, one_sided_cut, which is never + # used... + + if "one_sided_cut" in signature(balance_edge_fn).parameters: balance_edge_fn = partial(balance_edge_fn, one_sided_cut=True) possible_cuts = _bipartition_tree_random_all( - graph=graph, + subgraph_to_split=subgraph_to_split, pop_col=pop_col, pop_target=pop_target, epsilon=epsilon, @@ -882,11 +1504,17 @@ def bipartition_tree_random( max_attempts=max_attempts, ) if possible_cuts: - return choice(possible_cuts).subset + chosen_cut = choice(possible_cuts) + translated_nodes = subgraph_to_split.translate_subgraph_node_ids_for_set_of_nodes(chosen_cut.subset) + return frozenset(translated_nodes) # frm: Not sure if important that it be frozenset +# frm: used in this file and in tree_proposals.py +# But maybe this is intended to be used externally... +# frm TODO: RX version NYI... +# frm: Note that this routine is only used in recom() def epsilon_tree_bipartition( - graph: nx.Graph, + subgraph_to_split: Graph, # frm: Original code: graph: nx.Graph, parts: Sequence, pop_target: Union[float, int], pop_col: str, @@ -926,14 +1554,14 @@ def epsilon_tree_bipartition( ) flips = {} - remaining_nodes = graph.node_indices + remaining_nodes = subgraph_to_split.node_indices lb_pop = pop_target * (1 - epsilon) ub_pop = pop_target * (1 + epsilon) check_pop = lambda x: lb_pop <= x <= ub_pop nodes = method( - graph.subgraph(remaining_nodes), + subgraph_to_split.subgraph(remaining_nodes), pop_col=pop_col, pop_target=pop_target, epsilon=epsilon, @@ -944,10 +1572,16 @@ def epsilon_tree_bipartition( if nodes is None: raise BalanceError() + # Calculate the total population for the two districts based on the + # results of the "method()" partitioning. part_pop = 0 for node in nodes: + # frm: ???: The code above has already confirmed that len(parts) is 2 + # so why use negative index values - why not just use + # parts[0] and parts[1]? 
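+    # frm: (for len(parts) == 2, parts[-2] is parts[0] and parts[-1] is parts[1];
+    #      the negative indices seem to matter only because recursive_tree_part
+    #      reuses this pattern with longer `parts` sequences, where "the last two
+    #      parts" is what is meant)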
flips[node] = parts[-2] - part_pop += graph.nodes[node][pop_col] + # frm: original code: part_pop += graph.nodes[node][pop_col] + part_pop += subgraph_to_split.node_data(node)[pop_col] if not check_pop(part_pop): raise PopulationBalanceError() @@ -958,18 +1592,27 @@ def epsilon_tree_bipartition( part_pop = 0 for node in remaining_nodes: flips[node] = parts[-1] - part_pop += graph.nodes[node][pop_col] + # frm: original code: part_pop += graph.nodes[node][pop_col] + part_pop += subgraph_to_split.node_data(node)[pop_col] if not check_pop(part_pop): raise PopulationBalanceError() - return flips + # translate subgraph node_ids back into node_ids in parent graph + translated_flips = subgraph_to_split.translate_subgraph_node_ids_for_flips(flips) + + return translated_flips + + # frm: TODO: I think I need to translate flips elsewhere - need to check... # TODO: Move these recursive partition functions to their own module. They are not # central to the operation of the recom function despite being tree methods. +# frm: defined here but only used in partition.py +# But maybe this is intended to be used externally... +# frm TODO: RX version NYI... def recursive_tree_part( - graph: nx.Graph, + graph: Graph, # frm: Original code: graph: nx.Graph, parts: Sequence, pop_target: Union[float, int], pop_col: str, @@ -1018,6 +1661,16 @@ def recursive_tree_part( ub_pop = pop_target * (1 + epsilon) check_pop = lambda x: lb_pop <= x <= ub_pop + # frm: Notes to self: The code in the for-loop creates n-2 districts (where n is + # the number of partitions desired) by calling the "method" + # function, whose job it is to produce a connected set of + # nodes that has the desired population target. + # + # Note that it sets one_sided_cut=True which tells the + # "method" function that it is NOT bisecting the graph + # but is rather supposed to just find one connected + # set of nodes of the correct population size. + for part in parts[:-2]: min_pop = max(pop_target * (1 - epsilon), pop_target * (1 - epsilon) - debt) max_pop = min(pop_target * (1 + epsilon), pop_target * (1 + epsilon) - debt) @@ -1041,7 +1694,8 @@ def recursive_tree_part( part_pop = 0 for node in nodes: flips[node] = part - part_pop += graph.nodes[node][pop_col] + # frm: original code: part_pop += graph.nodes[node][pop_col] + part_pop += graph.node_data(node)[pop_col] if not check_pop(part_pop): raise PopulationBalanceError() @@ -1051,6 +1705,10 @@ def recursive_tree_part( # After making n-2 districts, we need to make sure that the last # two districts are both balanced. + + # frm: For the last call to "method", set one_sided_cut=False to + # request that "method" create two equal sized districts + # with the given population goal by bisecting the graph. nodes = method( graph.subgraph(remaining_nodes), pop_col=pop_col, @@ -1066,7 +1724,15 @@ def recursive_tree_part( part_pop = 0 for node in nodes: flips[node] = parts[-2] - part_pop += graph.nodes[node][pop_col] + # frm: this code fragment: graph.nodes[node][pop_col] is used + # many times and is a candidate for being wrapped with + # a function that has a meaningful name, such as perhaps: + # get_population_for_node(node, pop_col). + # This is an example of code-bloat from the perspective of + # code gurus, but it really helps a new code reviewer understand + # WTF is going on... 
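+    # frm: i.e. something like the following (name taken from the comment above;
+    #      hypothetical, not an existing helper):
+    #
+    #          def get_population_for_node(graph, node_id, pop_col):
+    #              """Population of a single node."""
+    #              return graph.node_data(node_id)[pop_col]
+    #
+    #      so that the loops below would read:
+    #
+    #          part_pop += get_population_for_node(graph, node, pop_col)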
+ # frm: original code: part_pop += graph.nodes[node][pop_col] + part_pop += graph.node_data(node)[pop_col] if not check_pop(part_pop): raise PopulationBalanceError() @@ -1077,16 +1743,18 @@ def recursive_tree_part( part_pop = 0 for node in remaining_nodes: flips[node] = parts[-1] - part_pop += graph.nodes[node][pop_col] + # frm: original code: part_pop += graph.nodes[node][pop_col] + part_pop += graph.node_data(node)[pop_col] if not check_pop(part_pop): raise PopulationBalanceError() return flips - -def get_seed_chunks( - graph: nx.Graph, +# frm: only used in this file, so I changed the name to have a leading underscore +# frm TODO: RX version NYI... +def _get_seed_chunks( + graph: Graph, # frm: Original code: graph: nx.Graph, num_chunks: int, num_dists: int, pop_target: Union[int, float], @@ -1122,22 +1790,49 @@ def get_seed_chunks( :returns: New assignments for the nodes of ``graph``. :rtype: List[List[int]] """ + + # frm: ??? TODO: Change the name of num_chunks_left to instead be num_districts_per_chunk. + # frm: ???: It is not clear to me when num_chunks will not evenly divide num_dists. In + # the only place where _get_seed_chunks() is called, it is inside an if-stmt + # branch that validates that num_chunks evenly divides num_dists... + # num_chunks_left = num_dists // num_chunks + + # frm: ???: TODO: Change the name of parts below to be something / anything else. Normally + # parts refers to districts, but here is is just a way to keep track of + # sets of nodes for chunks. Yes - they eventually become districts when + # this code gets to the base cases, but I found it confusing at this + # level... + # parts = range(num_chunks) + # frm: ???: I think that new_epsilon is the epsilon to use for each district, in which + # case the epsilon passed in would be for the HERE... new_epsilon = epsilon / (num_chunks_left * num_chunks) if num_chunks_left == 1: new_epsilon = epsilon chunk_pop = 0 for node in graph.node_indices: - chunk_pop += graph.nodes[node][pop_col] + # frm: original code: chunk_pop += graph.nodes[node][pop_col] + chunk_pop += graph.node_data(node)[pop_col] + # frm: TODO: See if there is a better way to structure this instead of a while True loop... while True: epsilon = abs(epsilon) flips = {} remaining_nodes = set(graph.nodes) + # frm: ??? What is the distinction between num_chunks and num_districts? + # I think that a chunk is typically a multiple of districts, so + # if we want 15 districts we might only ask for 5 chunks. Stated + # differently a chunk will always have at least enough nodes + # for a given number of districts. As the chunk size gets + # smaller, the number of nodes more closely matches what + # is needed for a set number of districts. + + # frm: Note: This just scales epsilon by the number of districts for each chunk + # so we can get chunks with the appropriate population sizes... min_pop = pop_target * (1 - new_epsilon) * num_chunks_left max_pop = pop_target * (1 + new_epsilon) * num_chunks_left @@ -1146,6 +1841,26 @@ def get_seed_chunks( diff = min(max_pop - chunk_pop_target, chunk_pop_target - min_pop) new_new_epsilon = diff / chunk_pop_target + # frm: Note: This code is clever... It loops through all of the + # parts (districts) except the last, and on each + # iteration, it finds nodes for the given part. 
+ # Each time through the loop it assigns the + # unassigned nodes to the last part, but + # most of this gets overwritten by the next + # iteration, so that at the end the only nodes + # still assigned to the last part are the ones + # that had not been previously assigned. + # + # It works, but is a little too clever for me. + # + # I would just have assigned all nodes to + # the last part before entering the loop + # with a comment saying that by end of loop + # the nodes not assigned in the loop will + # default to the last part. + # + + # Assign all nodes to one of the parts for i in range(len(parts[:-1])): part = parts[i] @@ -1168,13 +1883,22 @@ def get_seed_chunks( for node in remaining_nodes: flips[node] = parts[-1] + # frm: ???: Look at remaining_nodes to see if we are done part_pop = 0 + # frm: ???: Compute population total for remaining nodes. for node in remaining_nodes: - part_pop += graph.nodes[node][pop_col] + # frm: original code: part_pop += graph.nodes[node][pop_col] + part_pop += graph.node_data(node)[pop_col] + # frm: ???: Compute what the population total would be for each district in chunk part_pop_as_dist = part_pop / num_chunks_left fake_epsilon = epsilon + # frm: ???: If the chunk is for more than one district, divide epsilon by two if num_chunks_left != 1: fake_epsilon = epsilon / 2 + # frm: ???: Calculate max and min populations on a district level + # This will just be based on epsilon if we only want one district from chunk, but + # it will be based on half of epsilon if we want more than one district from chunk. + # This is odd - why wouldn't we use an epsilon min_pop_as_dist = pop_target * (1 - fake_epsilon) max_pop_as_dist = pop_target * (1 + fake_epsilon) @@ -1193,7 +1917,8 @@ def get_seed_chunks( return list(chunks.values()) - +# frm: only used in this file +# But maybe this is intended to be used externally... def get_max_prime_factor_less_than(n: int, ceil: int) -> Optional[int]: """ Helper function for recursive_seed_part_inner. Returns the largest prime factor of ``n`` @@ -1229,9 +1954,11 @@ def get_max_prime_factor_less_than(n: int, ceil: int) -> Optional[int]: return largest_factor - +# frm: only used in this file +# But maybe this is intended to be used externally... +# frm TODO: Peter says this is only ever used internally, so we can add underscore to the name def recursive_seed_part_inner( - graph: nx.Graph, + graph: Graph, # frm: Original code: graph: nx.Graph, num_dists: int, pop_target: Union[float, int], pop_col: str, @@ -1245,6 +1972,16 @@ def recursive_seed_part_inner( Inner function for recursive_seed_part. Returns a partition with ``num_dists`` districts balanced within ``epsilon`` of ``pop_target``. + + frm: ???: TODO: Correct the above statement that this function returns a partition. + In fact, it returns a list of sets of nodes, which is conceptually + equivalent to a partition, but is not a Partition object. Each + set of nodes constitutes a district, but the district does not + have an ID, and there is nothing that associates these nodes + with a specific graph - that is implicit, depending on the graph + object passed in, so the caller is responsible for knowing that + the returned list of sets belongs to the graph passed in... + Splits graph into num_chunks chunks, and then recursively splits each chunk into ``num_dists``/num_chunks chunks. 
The number num_chunks of chunks is chosen based on ``n`` and ``ceil`` as follows: @@ -1259,6 +1996,13 @@ def recursive_seed_part_inner( this function bites off a single district from the graph and recursively partitions the remaining graph into ``num_dists - 1`` districts. + frm: ???: OK, but why is the logic above for num_chunks the correct number? Is there + a mathematical reason for it? I assume so, but that explanation is missing... + + I presume that the reason is that something in the code that finds a + district scales exponentially, so it makes sense to divide and conquer. + Even so, why this particular strategy for divide and conquer? + :param graph: The underlying graph structure. :type graph: nx.Graph :param num_dists: number of districts to partition the graph into @@ -1292,6 +2036,18 @@ def recursive_seed_part_inner( :rtype: List of sets, each set is a district """ + """ + frm: This code is quite nice once you grok it. + + The goal is to find the given number of districts - but to do it in an + efficient way - meaning with smaller graphs. So conceptually, you want + to + HERE + + There are two base cases when the number of districts still to be found are + either 1 or + + """ # Chooses num_chunks if n is None: if ceil is None: @@ -1301,17 +2057,28 @@ def recursive_seed_part_inner( else: raise ValueError("ceil must be None or at least 2") elif n > 1: + # frm: Note: This is not guaranteed to evenly divide num_dists num_chunks = n else: raise ValueError("n must be None or a positive integer") # base case if num_dists == 1: - return [set(graph.nodes)] + # Just return an assignment with all of the nodes in the graph + # Translate the node_ids into parent_node_ids + translated_set_of_nodes = graph.translate_subgraph_node_ids_for_set_of_nodes( + graph.node_indices + ) + translated_assignment = [] + translated_assignment.append(translated_set_of_nodes) + return translated_assignment + + # frm: In the case when there are exactly 2 districts, split the graph by setting + # one_sided_cut to be False. if num_dists == 2: nodes = method( - graph, + graph.subgraph(graph.node_indices), # needs to be a subgraph pop_col=pop_col, pop_target=pop_target, epsilon=epsilon, @@ -1319,9 +2086,36 @@ def recursive_seed_part_inner( one_sided_cut=False, ) - return [set(nodes), set(graph.nodes) - set(nodes)] + # frm: Note to Self: the name "one_sided_cut" seems unnecessarily opaque. What it really + # means is whether to split the graph into two equal districts or + # whether to just find one district from a larger graph. When we + # clean up this code, consider changing the name of this parameter + # to something like: find_two_equal_sized_districts... + # + # Consider creating a wrapper function which has the better + # name that delegates to a private method to do the work. + + nodes_for_one_district = set(nodes) + nodes_for_the_other_district = set(graph.node_indices) - nodes_for_one_district + + # Translate the subgraph node_ids into parent_node_ids + translated_set_1 = graph.translate_subgraph_node_ids_for_set_of_nodes( + nodes_for_one_district + ) + translated_set_2 = graph.translate_subgraph_node_ids_for_set_of_nodes( + nodes_for_the_other_district + ) + + return [translated_set_1, translated_set_2] # bite off a district and recurse into the remaining subgraph + # frm: Note: In the case when num_chunks does not evenly divide num_dists, + # just find one district, remove those nodes from + # the unassigned nodes and try again with num_dists + # set to be one less. 
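+ # frm: Example (illustrative): with num_dists = 7 and num_chunks = 2,
+ #      7 % 2 != 0, so the code bites off one district and recurses
+ #      with num_dists = 6, which 2 does divide evenly.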
Stated differently, reduce + the number of desired districts until you get to + one that is evenly divisible by num_chunks and then + do chunk stuff... elif num_chunks is None or num_dists % num_chunks != 0: remaining_nodes = set(graph.nodes) nodes = method( @@ -1333,6 +2127,8 @@ one_sided_cut=True, ) remaining_nodes -= nodes + # frm: Create a list with the set of nodes returned by method() and then recurse + # to get the rest of the sets of nodes for remaining districts. assignment = [nodes] + recursive_seed_part_inner( graph.subgraph(remaining_nodes), num_dists - 1, @@ -1345,9 +2141,10 @@ ) # split graph into num_chunks chunks, and recurse into each chunk + # frm: TODO: Add documentation for why a subgraph is needed in the call below elif num_dists % num_chunks == 0: - chunks = get_seed_chunks( - graph, + chunks = _get_seed_chunks( + graph.subgraph(graph.node_indices), # needs to be a subgraph num_chunks, num_dists, pop_target, @@ -1360,7 +2157,7 @@ for chunk in chunks: chunk_assignment = recursive_seed_part_inner( graph.subgraph(chunk), - num_dists // num_chunks, + num_dists // num_chunks, # new target number of districts pop_target, pop_col, epsilon, @@ -1369,12 +2166,30 @@ ceil=ceil, ) assignment += chunk_assignment + else: + # frm: From the logic above, this should never happen, but if it did + # because of a future edit (bug), at least this will catch it + # early before really bizarre things happen... + raise Exception("recursive_seed_part_inner(): Should never happen...") + + # The assignment object that has been created needs to have its + # node_ids translated into parent_node_ids + + translated_assignment = [] + for set_of_nodes in assignment: + translated_set_of_nodes = graph.translate_subgraph_node_ids_for_set_of_nodes( + set_of_nodes + ) + translated_assignment.append(translated_set_of_nodes) + + return translated_assignment - return assignment +# frm ???: This routine is never called - not in this file and not in any other GerryChain file. +# Is it intended to be used by end-users? And if so, for what purpose? def recursive_seed_part( - graph: nx.Graph, + graph: Graph, # frm: Original code: graph: nx.Graph, parts: Sequence, pop_target: Union[float, int], pop_col: str, @@ -1420,9 +2235,24 @@ def recursive_seed_part( :returns: New assignments for the nodes of ``graph``. :rtype: dict """ + + # frm: Note: It is not strictly necessary to use a subgraph in the call below on + # recursive_seed_part_inner(), because the top-level graph has + # a _node_id_to_parent_node_id_map that just maps node_ids to themselves. However, + # it seemed good practice to ALWAYS pass a subgraph to routines that are + # intended to deal with subgraphs, even when not strictly + # necessary. One less bit of cognitive load to worry about. + # + # This probably means that the identity _node_id_to_parent_node_id_map for top-level + # graphs will never be used, but I still think that it makes sense to retain + # it - again, for consistency: Every graph knows how to translate to + # parent_node_ids even if it is a top-level graph. + # + # In short - an argument based on invariants being a good thing...
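+    # frm: Sketch (hypothetical, not the actual implementation): the list of
+    #      node-sets returned by recursive_seed_part_inner() is what gets turned
+    #      into the flips dict mapping node -> part label, conceptually:
+    #
+    #          flips = {}
+    #          for label, node_set in zip(parts, assignment):
+    #              for node in node_set:
+    #                  flips[node] = label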
+ # flips = {} assignment = recursive_seed_part_inner( - graph, + graph.subgraph(graph.node_indices), len(parts), pop_target, pop_col, @@ -1444,3 +2274,6 @@ class BalanceError(Exception): class PopulationBalanceError(Exception): """Raised when the population of a district is outside the acceptable epsilon range.""" + + + \ No newline at end of file diff --git a/gerrychain/updaters/compactness.py b/gerrychain/updaters/compactness.py index 7b42e201..aee6c887 100644 --- a/gerrychain/updaters/compactness.py +++ b/gerrychain/updaters/compactness.py @@ -16,13 +16,22 @@ def boundary_nodes(partition, alias: str = "boundary_nodes") -> Set: :returns: The set of nodes in the partition that are on the boundary. :rtype: Set """ + + # frm: TODO: Figure out what is going on with the "alias" parameter. + # It is used to get the value from the parent if there is + # a parent, but it is NOT used when computing the result + # for the first partition. Seems like a logic bug... + if partition.parent: return partition.parent[alias] - return { - node - for node in partition.graph.nodes - if partition.graph.nodes[node]["boundary_node"] - } + else: + result = { + node + for node in partition.graph.nodes + # frm: original code: if partition.graph.nodes[node]["boundary_node"] + if partition.graph.node_data(node)["boundary_node"] + } + return result def initialize_exterior_boundaries_as_a_set(partition) -> Dict[int, Set]: @@ -37,6 +46,7 @@ def initialize_exterior_boundaries_as_a_set(partition) -> Dict[int, Set]: part_boundaries = collections.defaultdict(set) for node in partition["boundary_nodes"]: part_boundaries[partition.assignment.mapping[node]].add(node) + return part_boundaries @@ -63,6 +73,16 @@ def exterior_boundaries_as_a_set( partition. :rtype: Set """ + # Compute the new set of boundary nodes for the partition. + # + # The term, (inflow & graph_boundary), computes new nodes that are boundary nodes. + # + # the term, (previous | (inflow & graph_boundary)), adds those new boundary nodes to the + # set of previous boundary nodes. + # + # Then all you need to do is subtract all of the nodes in the outflow to remove any of those + # that happen to be boundary nodes... + graph_boundary = partition["boundary_nodes"] return (previous | (inflow & graph_boundary)) - outflow @@ -80,7 +100,8 @@ def initialize_exterior_boundaries(partition) -> Dict[int, float]: boundaries = collections.defaultdict(lambda: 0) for node in graph_boundary: part = partition.assignment.mapping[node] - boundaries[part] += partition.graph.nodes[node]["boundary_perim"] + # frm: original code: boundaries[part] += partition.graph.nodes[node]["boundary_perim"] + boundaries[part] += partition.graph.node_data(node)["boundary_perim"] return boundaries @@ -107,11 +128,13 @@ def exterior_boundaries(partition, previous: Set, inflow: Set, outflow: Set) -> """ graph_boundary = partition["boundary_nodes"] added_perimeter = sum( - partition.graph.nodes[node]["boundary_perim"] + # frm: original code: partition.graph.nodes[node]["boundary_perim"] + partition.graph.node_data(node)["boundary_perim"] for node in inflow & graph_boundary ) removed_perimeter = sum( - partition.graph.nodes[node]["boundary_perim"] + # frm: original code: partition.graph.nodes[node]["boundary_perim"] + partition.graph.node_data(node)["boundary_perim"] for node in outflow & graph_boundary ) return previous + added_perimeter - removed_perimeter @@ -126,13 +149,46 @@ def initialize_interior_boundaries(partition): perimeter the given part shares with other parts. 
:rtype: Dict[int, float] """ - return { - part: sum( - partition.graph.edges[edge]["shared_perim"] + + # frm: RustworkX Note: + # + # The old NX code did not distinguish between edges and edge_ids - they were one + # and the same. However, in RX an edge is a tuple and an edge_id is an integer. + # The edges stored in partition["cut_edges_by_part"] are edges (tuples), so + # we need to get the edge_id for each edge in order to access the data for the edge. + + # frm: Original Code: + # return { + # part: sum( + # partition.graph.edges[edge]["shared_perim"] + # for edge in partition["cut_edges_by_part"][part] + # ) + # } + + # Get edge_ids for each edge (tuple) + edge_ids_for_part = { + part: [ + partition.graph.get_edge_id_from_edge(edge) for edge in partition["cut_edges_by_part"][part] + ] + for part in partition.parts + } + + edge_to_edge_id_map = [ + (edge, partition.graph.get_edge_id_from_edge(edge)) + for edge in partition.graph.edges + ] + + # Compute length of the shared perimeter of each part + shared_perimeters_for_part = { + part: sum( + partition.graph.edge_data(edge_id)["shared_perim"] + for edge_id in edge_ids_for_part[part] ) for part in partition.parts } + + return shared_perimeters_for_part @on_edge_flow(initialize_interior_boundaries, alias="interior_boundaries") @@ -159,11 +215,23 @@ def interior_boundaries( boundary of that part. :rtype: Dict """ + + # frm: TODO: NX vs. RX Issue - need to use edge_ids below to access edge information... + # I think I have done this already below... + added_perimeter = sum( - partition.graph.edges[edge]["shared_perim"] for edge in new_edges + # frm: Original Code: partition.graph.edges[edge]["shared_perim"] for edge in new_edges + # frm: edges vs edge_ids: edge_ids are wanted here (integers) + partition.graph.edge_data( + partition.graph.get_edge_id_from_edge(edge) + )["shared_perim"] for edge in new_edges ) removed_perimeter = sum( - partition.graph.edges[edge]["shared_perim"] for edge in old_edges + # frm: Original Code: partition.graph.edges[edge]["shared_perim"] for edge in old_edges + # frm: edges vs edge_ids: edge_ids are wanted here (integers) + partition.graph.edge_data( + partition.graph.get_edge_id_from_edge(edge) + )["shared_perim"] for edge in old_edges ) return previous + added_perimeter - removed_perimeter @@ -177,6 +245,7 @@ def flips(partition) -> Dict: given partition. :rtype: Dict """ + # frm: ???: Does anyone ever use this? It seems kind of useless... return partition.flips @@ -184,7 +253,7 @@ def perimeter_of_part(partition, part: int) -> float: """ Totals up the perimeter of the part in the partition. - .. Warning:: + .. Warning:: frm: TODO: Add code to enforce this warning... Requires that 'boundary_perim' be a node attribute, 'shared_perim' be an edge attribute, 'cut_edges' be an updater, and 'exterior_boundaries' be an updater. diff --git a/gerrychain/updaters/county_splits.py b/gerrychain/updaters/county_splits.py index fad28f4c..e76ae78e 100644 --- a/gerrychain/updaters/county_splits.py +++ b/gerrychain/updaters/county_splits.py @@ -79,21 +79,27 @@ def compute_county_splits( # Create the initial county data containers. 
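+        # frm: Example (illustrative, made-up ids): after the loop below, county_dict
+        #      maps each county value to a CountyInfo triple of
+        #      (split_status, nodes, parts_seen), e.g.
+        #
+        #          county_dict["3"] == CountyInfo(CountySplit.OLD_SPLIT, [4, 5, 12], {1, 2})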
if not partition.parent: + county_dict = dict() for node in partition.graph.node_indices: + + # First, get the current status of the county's information county = partition.graph.lookup(node, county_field) if county in county_dict: split, nodes, seen = county_dict[county] else: split, nodes, seen = CountySplit.NOT_SPLIT, [], set() + # Now update "nodes" and "seen" with this node and the part (district) from the partition's assignment. nodes.append(node) seen.update(set([partition.assignment.mapping[node]])) + # Lastly, if we have "seen" more than one part (district), then the county is split across parts. if len(seen) > 1: split = CountySplit.OLD_SPLIT + # update the county_dict with new information county_dict[county] = CountyInfo(split, nodes, seen) return county_dict @@ -145,17 +151,22 @@ def _get_splits(partition): def total_reg_splits(partition, reg_attr): """Returns the total number of times that reg_attr is split in the partition.""" all_region_names = set( - partition.graph.nodes[node][reg_attr] for node in partition.graph.nodes + # frm: original code: partition.graph.nodes[node][reg_attr] for node in partition.graph.nodes + partition.graph.node_data(node)[reg_attr] for node in partition.graph.nodes ) split = {name: 0 for name in all_region_names} # Require that the cut_edges updater is attached to the partition for node1, node2 in partition["cut_edges"]: if ( partition.assignment[node1] != partition.assignment[node2] - and partition.graph.nodes[node1][reg_attr] - == partition.graph.nodes[node2][reg_attr] + # frm: original code: and partition.graph.nodes[node1][reg_attr] + # frm: original code: == partition.graph.nodes[node2][reg_attr] + and partition.graph.node_data(node1)[reg_attr] + == partition.graph.node_data(node2)[reg_attr] ): - split[partition.graph.nodes[node1][reg_attr]] += 1 - split[partition.graph.nodes[node2][reg_attr]] += 1 + # frm: original code: split[partition.graph.nodes[node1][reg_attr]] += 1 + # frm: original code: split[partition.graph.nodes[node2][reg_attr]] += 1 + split[partition.graph.node_data(node1)[reg_attr]] += 1 + split[partition.graph.node_data(node2)[reg_attr]] += 1 return sum(1 for value in split.values() if value > 0) diff --git a/gerrychain/updaters/cut_edges.py index 7fac766e..8cc85c4e 100644 --- a/gerrychain/updaters/cut_edges.py +++ b/gerrychain/updaters/cut_edges.py @@ -3,29 +3,29 @@ from .flows import on_edge_flow, neighbor_flips -def put_edges_into_parts(edges: List, assignment: Dict) -> Dict: + +def _put_edges_into_parts(cut_edges: List, assignment: Dict) -> Dict: """ - :param edges: A list of edges in a graph which are to be separated + :param cut_edges: A list of cut_edges in a graph which are to be separated into their respective parts within the partition according to the given assignment. - :type edges: List + :type cut_edges: List :param assignment: A dictionary mapping nodes to their respective parts within the partition. :type assignment: Dict - :returns: A dictionary mapping each part of a partition to the set of edges + :returns: A dictionary mapping each part of a partition to the set of cut_edges in that part.
:rtype: Dict """ by_part = collections.defaultdict(set) - for edge in edges: + for edge in cut_edges: # add edge to the sets corresponding to the parts it touches by_part[assignment.mapping[edge[0]]].add(edge) by_part[assignment.mapping[edge[1]]].add(edge) return by_part - -def new_cuts(partition) -> Set[Tuple]: +def _new_cuts(partition) -> Set[Tuple]: """ :param partition: A partition of a Graph :type partition: :class:`~gerrychain.partition.Partition` @@ -40,7 +40,7 @@ def new_cuts(partition) -> Set[Tuple]: } -def obsolete_cuts(partition) -> Set[Tuple]: +def _obsolete_cuts(partition) -> Set[Tuple]: """ :param partition: A partition of a Graph :type partition: :class:`~gerrychain.partition.Partition` @@ -55,28 +55,48 @@ def obsolete_cuts(partition) -> Set[Tuple]: and not partition.crosses_parts((node, neighbor)) } - def initialize_cut_edges(partition): """ :param partition: A partition of a Graph :type partition: :class:`~gerrychain.partition.Partition` + frm: TODO: This description should be updated. Cut_edges are edges that touch + two different parts (districts). They are the internal boundaries + between parts (districts). This routine finds all of the cut_edges + in the graph and then creates a dict that stores all of the cut_edges + for each part (district). This dict becomes the value of + partition["cut_edges"]. + + Peter agreed: + Ah, you are correct. It maps parts to cut edges, not just any edges in the partition + + + :returns: A dictionary mapping each part of a partition to the set of edges in that part. :rtype: Dict """ - edges = { + # Compute the set of edges that are "cut_edges" - that is, edges that go from + # one part (district) to another. + cut_edges = { tuple(sorted(edge)) + # frm: edges vs edge_ids: edges are wanted here (tuples) for edge in partition.graph.edges if partition.crosses_parts(edge) } - return put_edges_into_parts(edges, partition.assignment) + return _put_edges_into_parts(cut_edges, partition.assignment) @on_edge_flow(initialize_cut_edges, alias="cut_edges_by_part") def cut_edges_by_part( partition, previous: Set[Tuple], new_edges: Set[Tuple], old_edges: Set[Tuple] ) -> Set[Tuple]: + # + # frm TODO: Update / expand the documentation for this routine. + # + # This only operates on cut-edges and not on all of the + # edges in a partition. A "cut-edge" is an edge that spans two districts. + # """ Updater function that responds to the flow of edges between different partitions. @@ -115,6 +135,6 @@ def cut_edges(partition): # Edges that weren't cut, but now are cut # We sort the tuples to make sure we don't accidentally end # up with both (4,5) and (5,4) (for example) in it - new, obsolete = new_cuts(partition), obsolete_cuts(partition) + new, obsolete = _new_cuts(partition), _obsolete_cuts(partition) return (parent["cut_edges"] | new) - obsolete diff --git a/gerrychain/updaters/election.py b/gerrychain/updaters/election.py index 2415de42..6a314cef 100644 --- a/gerrychain/updaters/election.py +++ b/gerrychain/updaters/election.py @@ -108,6 +108,38 @@ def __init__( self.updater = ElectionUpdater(self) + def _initialize_self(self, partition): + """ + Because node_ids are changed when converting from NX to RX based graphs when + we create a partition, we need to delay initialization of internal data members + that depend on node_ids until AFTER the partition has been created. That is + because we don't know how to map original node_ids to internal node_ids until + the partition is created. 
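+ + frm: Example (illustrative): an Election given vote data as an explicit dict keyed + by original node_ids (say {0: 55, 1: 40}) must have those keys re-mapped to + the internal RX node_ids before the first tally, since those are the ids + the partition actually uses.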
+ + Note that the fact that node_ids have changed is hidden by the fact that + """ + + # frm: TODO: Clean this up... + # + # This is a mess - I am going to reset to the original code and make + # 100% sure I grok what is happening... + + """ + Convert _original_parties_to_columns to use internal_ids + + internal_parties_to_columns = ??? translate original node_ids... + ??? handle the case when instead of a dict it is a list + + Then just use the code from before, but with new node_ids + """ + + # Compute totals for each "party" => dict of form: {part: sum} + # where "part" is a district in partition + self.tallies = { + party: DataTally(self.parties_to_columns[party], party) + for party in self.parties + } + def __str__(self): return "Election '{}' with vote totals for parties {} from columns {}.".format( self.name, str(self.parties), str(self.columns) @@ -167,6 +199,10 @@ def get_previous_values(self, partition) -> Dict[str, Dict[int, float]]: return previous_totals_for_party +# frm: TODO: This routine, get_percents(), is only ever used inside ElectionResults. +# +# Why is it not defined as an internal function inside ElectionResults? +# def get_percents(counts: Dict, totals: Dict) -> Dict: """ :param counts: A dictionary mapping each part in a partition to the diff --git a/gerrychain/updaters/flows.py b/gerrychain/updaters/flows.py index bf00096b..7f6706e5 100644 --- a/gerrychain/updaters/flows.py +++ b/gerrychain/updaters/flows.py @@ -2,6 +2,9 @@ import functools from typing import Dict, Set, Tuple, Callable +# frm: TODO: This file needs documentation / comments!!! +# +# Peter agrees... @functools.lru_cache(maxsize=2) def neighbor_flips(partition) -> Set[Tuple]: @@ -129,18 +132,40 @@ def compute_edge_flows(partition) -> Dict: new_source = assignment.mapping[node] new_target = assignment.mapping[neighbor] - cut = new_source != new_target - was_cut = old_source != old_target + # frm: Clarification to myself... + # A "cut edge" is one where the nodes in the edge are assigned to different + # districts. So, how does a flip change whether an edge is a cut edge? There + # are three possibilities: 1) the edge goes from not being a cut edge to being + # a cut edge, 2) the edge goes from being a cut edge to not being a cut edge, + # and 3) the edge was a cut edge before and is still a cut edge after the flip, + # but the partition assignments to one or the other nodes in the edge changes. + # + # That is what the if-stmt below is doing - determining which of the three + # cases each flip falls into. It updates the flows accordingly... + # + cut = new_source != new_target # after flip, the edge is a cut edge + was_cut = old_source != old_target # before flip, the edge was a cut edge if not cut and was_cut: + # was a cut edge before, but now is not, so flows out of both edge_flows[old_target]["out"].add(edge) edge_flows[old_source]["out"].add(edge) elif cut and not was_cut: + # was not a cut edge before, but now is, so flows into both edge_flows[new_target]["in"].add(edge) edge_flows[new_source]["in"].add(edge) elif cut and was_cut: # If an edge was cut and still is cut, we need to make sure the # edge is listed under the correct parts. + # frm: Clarification to myself... 
Python set subtraction will delete + # from the set on the left any members of the set on the right, + # so no_longer_incident_parts will determine if either old_target, + # or old_source has changed - that is, whether the assignment of + # one of the old endpoints has changed - if so, the edge has + # gone "out" of that part. If you do the subtraction the + # other way, you find whether the new mappings have changed + # and you can then update the "in" flows + # no_longer_incident_parts = {old_target, old_source} - { new_target, new_source, @@ -151,6 +176,7 @@ newly_incident_parts = {new_target, new_source} - {old_target, old_source} for part in newly_incident_parts: edge_flows[part]["in"].add(edge) + return edge_flows diff --git a/gerrychain/updaters/locality_split_scores.py index 28720b2f..130df528 100644 --- a/gerrychain/updaters/locality_split_scores.py +++ b/gerrychain/updaters/locality_split_scores.py @@ -1,9 +1,19 @@ # Imports from collections import defaultdict, Counter +# frm TODO: Remove dependence on NetworkX. +# The only use is: +# pieces += nx.number_connected_components(subgraph) import networkx as nx import math from typing import List +# frm: TODO: Do performance testing and improve performance of these routines. +# +# Peter made the comment in a PR that we should make this code more efficient: +# +# A note on this file: A ton of the code in here is inefficient. This was +# made 6 years ago and hasn't really been touched since then other than +# when I was doing an overhaul on many of the doc strings class LocalitySplits: """ @@ -134,8 +144,31 @@ def __init__( def __call__(self, partition): + # frm: TODO: LocalitySplits: Figure out how this is intended to be used... + # + # Not quite sure why it is better to have a "__call__()" method instead of a + # get_scores(self) method, but whatever... + # + # This routine indeed just computes the requested scores (specified in the constructor). + # It stashes those scores as a data member in the class and returns them to the caller as well. + # + # This all seems kind of misguided to me - and there is no instance of this being used in + # the gerrychain code except in a test, so I am not sure how it is intended to be used. + # + # Probably need to look at some user code that Peter sent me to see if anyone actually uses + # this and if so, how... + # + if self.localities == []: - self.localitydict = dict(partition.graph.nodes(data=self.col_id)) + # frm: TODO: NX vs. RX issues here. graph.nodes(data=) is NX specific... + + # frm: Original code: + # self.localitydict = dict(partition.graph.nodes(data=self.col_id)) + # + self.localitydict = {} + for node_id in partition.graph.node_indices: + self.localitydict[node_id] = partition.graph.node_data(node_id)[self.col_id] + self.localities = set(list(self.localitydict.values())) locality_splits = { @@ -154,23 +187,73 @@ def __call__(self, partition): allowed_pieces = {} totpop = 0 - for node in partition.graph.nodes: - totpop += partition.graph.nodes[node][self.pop_col] + for node in partition.graph.node_indices: + # frm: TODO: Once you have a partition, you cannot change the total population + # in the Partition, so why don't we cache the total population as + # a data member in Partition?
+ # + # Peter agreed that this would be a good thing to do + + # frm: original code: totpop += partition.graph.nodes[node][self.pop_col] + totpop += partition.graph.node_data(node)[self.pop_col] + + # frm: TODO: Ditto with num_districts - isn't this a constant once you create a Partition? + # + # Peter agreed that this would be a good thing to do. num_districts = len(partition.assignment.parts.keys()) + # Compute the total population for each locality and then the number of "allowed pieces" for loc in self.localities: - sg = partition.graph.subgraph( - n - for n, v in partition.graph.nodes(data=True) - if v[self.col_id] == loc - ) - - pop = 0 - for n in sg.nodes(): - pop += sg.nodes[n][self.pop_col] + # frm: TODO: The code below just calculates the total population for a set of nodes. + # This sounds like a good candidate for a utility function. See if this + # logic is repeated elsewhere... + + # frm: I changed the original code for a couple of reasons: + # + # * There were NX dependencies in the original code. + # partition.graph.nodes(data=True) + # * Creating a subgraph just to get a subset of nodes seemed unnecessary + # and probably expensive. + # * I found the code dense and it took me too long to figure out what it did. + + # frm: Original Code: + # + # sg = partition.graph.subgraph( + # n + # for n, v in partition.graph.nodes(data=True) + # if v[self.col_id] == loc + # ) + # + # pop = 0 + # for n in sg.nodes(): + # # frm: TODO: I think this needs to change to work for RX... + # pop += sg.nodes[n][self.pop_col] + # + # allowed_pieces[loc] = math.ceil(pop / (totpop / num_districts)) + + # frm: new version of this code that is less clever... + + # Compute the population associated with each locality + the_graph = partition.graph + locality_population = {} # dict mapping locality name to population in that locality + for node_id in the_graph.node_indices: + locality_name = the_graph.node_data(node_id)[self.col_id] + locality_pop = the_graph.node_data(node_id)[self.pop_col] + if locality_name not in locality_population: + locality_population[locality_name] = locality_pop + else: + locality_population[locality_name] += locality_pop + + # frm: TODO: Peter commented (in PR) that this is another thing that + # could be cached so we didn't recompute it over and over...
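+            # frm: Sketch (hypothetical) of the caching Peter suggested: compute the
+            #      ideal population once and stash it, e.g. on the partition itself:
+            #
+            #          if not hasattr(partition, "_ideal_pop_per_district"):
+            #              partition._ideal_pop_per_district = totpop / num_districts
+            #          ideal_population_per_district = partition._ideal_pop_per_district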
+ ideal_population_per_district = totpop / num_districts + + # Compute the number of "allowed pieces" for each locality + allowed_pieces = {} + for locality_name in locality_population.keys(): + pop_for_locality = locality_population[locality_name] + allowed_pieces[locality_name] = math.ceil(pop_for_locality / ideal_population_per_district) - allowed_pieces[loc] = math.ceil(pop / (totpop / num_districts)) self.allowed_pieces = allowed_pieces for s in self.scores: @@ -227,8 +310,9 @@ def num_pieces(self, partition) -> int: """ locality_intersections = {} - for n in partition.graph.nodes(): - locality = partition.graph.nodes[n][self.col_id] + for n in partition.graph.node_indices: + # frm: original code: locality = partition.graph.nodes[n][self.col_id] + locality = partition.graph.node_data(n)[self.col_id] if locality not in locality_intersections: locality_intersections[locality] = set( [partition.assignment.mapping[n]] @@ -243,11 +327,15 @@ [ x for x in partition.parts[d] - if partition.graph.nodes[x][self.col_id] == locality + # frm: original code: if partition.graph.nodes[x][self.col_id] == locality + if partition.graph.node_data(x)[self.col_id] == locality ] ) - pieces += nx.number_connected_components(subgraph) + # frm: Original Code: + # + # pieces += nx.number_connected_components(subgraph) + pieces += subgraph.num_connected_components() return pieces def naked_boundary(self, partition) -> int: @@ -380,7 +468,10 @@ def symmetric_entropy(self, partition) -> float: # IN PROGRESS vtds = district_dict[district] locality_pop = {k: 0 for k in self.localities} for vtd in vtds: - locality_pop[self.localitydict[vtd]] += partition.graph.nodes[vtd][ + # frm: original code: locality_pop[self.localitydict[vtd]] += partition.graph.nodes[vtd][ + # frm: original code: self.pop_col + # frm: original code: ] + locality_pop[self.localitydict[vtd]] += partition.graph.node_data(vtd)[ self.pop_col ] district_dict[district] = locality_pop diff --git a/gerrychain/updaters/spanning_trees.py index 307daf40..2f6cddce 100644 --- a/gerrychain/updaters/spanning_trees.py +++ b/gerrychain/updaters/spanning_trees.py @@ -4,7 +4,6 @@ import math import numpy -import networkx from typing import Dict @@ -25,7 +24,8 @@ def _num_spanning_trees_in_district(partition, district: int) -> int: :rtype: int """ graph = partition.subgraphs[district] - laplacian = networkx.laplacian_matrix(graph) + # frm: Original Code: laplacian = networkx.laplacian_matrix(graph) + laplacian = graph.laplacian_matrix() # use the district subgraph, not partition.graph, to match the original NX code L = numpy.delete(numpy.delete(laplacian.todense(), 0, 0), 1, 1) return math.exp(numpy.linalg.slogdet(L)[1]) diff --git a/gerrychain/updaters/tally.py index 97305b38..9b9a6d5a 100644 --- a/gerrychain/updaters/tally.py +++ b/gerrychain/updaters/tally.py @@ -35,23 +35,62 @@ def __init__(self, data: Union[Dict, pandas.Series, str], alias: str) -> None: self.alias = alias def initialize_tally(partition): + + # If the "data" passed in was a string, then interpret that string + # as the name of a node attribute in the graph, and construct + # a dict of the form: {node_id: node_attribute_value} + # + # If not, then assume that the "data" passed in is already of the + # form: {node_id: data_value} + + # frm: TODO: Verify that if the "data" passed in is not a string that it + # is of the form: {node_id: data_value} + + # frm: TODO: If self.data is a dict: {node: votes} then check whether its keys are original node_ids + # + # This came up
with Election updaters - if you specify the data in an explicit + dict of {node: votes}, then things get screwed up because at the time you create + the Election object, the partition has not yet been created, so the node_ids are + original node_ids which are not appropriate after the partition has been created + and the new RX graph has new node_ids. + # + # In the Election updater case, the fix would be to delay the initial tally to + # happen AFTER the partition is created and to at some point before doing the + # initial tally, translate the original node_ids to be internal RX node_ids. + # + # However, I am wondering if this problem is a general problem with tallies + # made by other updaters. Stated differently, is it safe to assume that an + # explicit dict of {node_id: votes} is ALWAYS done with original node_ids in all + # cases of the use of tallies? + # + # => What other code uses Tally? + if isinstance(self.data, str): - nodes = partition.graph.nodes - attribute = self.data - self.data = {node: nodes[node][attribute] for node in nodes} + # frm: Original Code: + # nodes = partition.graph.nodes + # attribute = self.data + # self.data = {node: nodes[node][attribute] for node in nodes} + graph = partition.graph + node_ids = partition.graph.node_indices + attribute = self.data + self.data = {node_id: graph.node_data(node_id)[attribute] for node_id in node_ids} + tally = collections.defaultdict(int) - for node, part in partition.assignment.items(): - add = self.data[node] + for node_id, part in partition.assignment.items(): + add = self.data[node_id] + # frm: TODO: Should I also test that the "add" variable is a number or something + # that can be added? if math.isnan(add): warnings.warn( - "ignoring nan encountered at node '{}' for attribute '{}'".format( - node, self.alias + "ignoring nan encountered at node_id '{}' for attribute '{}'".format( + node_id, self.alias ) ) else: tally[part] += add + return dict(tally) @on_flow(initialize_tally, alias=alias) diff --git a/tests/_foo/do_laplacian.py new file mode 100644 index 00000000..40fb6601 --- /dev/null +++ b/tests/_foo/do_laplacian.py @@ -0,0 +1,46 @@ + +import networkx as nx +import rustworkx as rx +import numpy as np +from graph import Graph +import tree as gc_tree + +# Create an RX graph (replace with your graph data) +rx_graph = rx.PyGraph() +rx_graph.add_nodes_from([0, 1, 2, 3]) +rx_graph.add_edges_from([(0, 1, "data"), (0, 2, "data"), (1, 2, "data"), (2, 3, "data")]) + +# 1. Get the adjacency matrix +adj_matrix = rx.adjacency_matrix(rx_graph) + +# 2. Calculate the degree matrix (simplified for this example) +degree_matrix = np.diag([rx_graph.degree(node) for node in rx_graph.node_indices()]) + +# 3. Calculate the Laplacian matrix +rx_laplacian_matrix = degree_matrix - adj_matrix + +print("RX Adjacency Matrix:") +print(adj_matrix) + +print("\nRX Degree Matrix:") +print(degree_matrix) + +print("\nRX Laplacian Matrix:") +print(rx_laplacian_matrix) + +print("type of RX laplacian_matrix is: ", type(rx_laplacian_matrix)) + +# Create an NX graph (replace with your graph data) +nx_graph = nx.Graph([(0, 1), (0, 2), (1, 2), (2, 3)]) +nx_laplacian_matrix = nx.laplacian_matrix(nx_graph) + +print("\nNX Laplacian Matrix:") +print(nx_laplacian_matrix) + +print("type of NX laplacian_matrix is: ", type(nx_laplacian_matrix)) + +gc_nx_graph = Graph.from_nx_graph(nx_graph) +gc_rx_graph = Graph.from_rx_graph(rx_graph) + +# Note: the module was imported as gc_tree above, so use that name here +print("\ngc_laplacian(nx_graph) is: ", gc_tree.gc_laplacian_matrix(gc_nx_graph)) +print("\ngc_laplacian(rx_graph) is: ", gc_tree.gc_laplacian_matrix(gc_rx_graph)) diff --git a/tests/conftest.py index 501906ab..e23125f8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,8 +15,8 @@ def three_by_three_grid(): 3 4 5 6 7 8 """ - graph = Graph() - graph.add_edges_from( + nx_graph = nx.Graph() + nx_graph.add_edges_from( [ (0, 1), (0, 3), (1, 2), (1, 4), (2, 5), (3, 4), (3, 6), (4, 5), (4, 7), (5, 8), (6, 7), (7, 8), ] ) - return graph - + return Graph.from_networkx(nx_graph) @pytest.fixture def four_by_five_grid_for_opt(): @@ -47,8 +46,8 @@ def four_by_five_grid_for_opt(): # 5 6 7 8 9 # 0 1 2 3 4 - graph = Graph() - graph.add_nodes_from( + nx_graph = nx.Graph() + nx_graph.add_nodes_from( [ (0, {"population": 10, "opt_value": 1, "MVAP": 2}), (1, {"population": 10, "opt_value": 1, "MVAP": 2}), @@ -73,7 +72,7 @@ ] ) - graph.add_edges_from( + nx_graph.add_edges_from( [ (0, 1), (0, 5), @@ -109,26 +108,36 @@ ] ) - return graph + return Graph.from_networkx(nx_graph) @pytest.fixture def graph_with_random_data_factory(three_by_three_grid): + def factory(columns): graph = three_by_three_grid attach_random_data(graph, columns) return graph + # A closure - will add random data (int) to all nodes for each named "column" return factory +# frm: TODO: This routine is only ever used immediately above in def factory(columns). + # Is it part of the external API? If not, then it should be moved inside + # the graph_with_random_data_factory() routine def attach_random_data(graph, columns): for node in graph.nodes: for col in columns: - graph.nodes[node][col] = random.randint(1, 1000) + # frm: Original code: graph.nodes[node][col] = random.randint(1, 1000) + graph.node_data(node)[col] = random.randint(1, 1000) @pytest.fixture +# frm: ???: Why not just always use three_by_three_grid? At least that gives + # the reader an idea of how many nodes there are? What is the + # value of just having a generic "graph" test fixture???
+# def graph(three_by_three_grid): return three_by_three_grid diff --git a/tests/constraints/test_contiguity.py b/tests/constraints/test_contiguity.py index b94f1b5d..e40d1e77 100644 --- a/tests/constraints/test_contiguity.py +++ b/tests/constraints/test_contiguity.py @@ -3,6 +3,7 @@ def test_contiguous_components(graph): + partition = Partition(graph, {0: 1, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 1, 7: 1, 8: 1}) components = contiguous_components(partition) @@ -10,9 +11,22 @@ def test_contiguous_components(graph): assert len(components[1]) == 2 assert len(components[2]) == 1 - assert set(frozenset(g.nodes) for g in components[1]) == { + # frm: Original Code: + # + # assert set(frozenset(g.nodes) for g in components[1]) == { + # frozenset([0, 1, 2]), + # frozenset([6, 7, 8]), + # } + # assert set(components[2][0].nodes) == {3, 4, 5} + + # Confirm that the appropriate connected subgraphs were found. Note that we need + # to compare against the original node_ids, since RX node_ids change every time + # you create a subgraph. + + assert set(frozenset(g.original_node_ids_for_set(g.nodes)) for g in components[1]) == { frozenset([0, 1, 2]), frozenset([6, 7, 8]), } - - assert set(components[2][0].nodes) == {3, 4, 5} + assert set(frozenset(g.original_node_ids_for_set(g.nodes)) for g in components[2]) == { + frozenset([3, 4, 5]), + } diff --git a/tests/constraints/test_validity.py b/tests/constraints/test_validity.py index 7bc3e01d..1c915683 100644 --- a/tests/constraints/test_validity.py +++ b/tests/constraints/test_validity.py @@ -55,6 +55,7 @@ def test_contiguous_with_contiguity_no_flips_is_true(contiguous_partition): def test_contiguous_with_contiguity_flips_is_true(contiguous_partition_with_flips): contiguous_partition, test_flips = contiguous_partition_with_flips + # frm: TODO: Figure out whether test_flips are in original node_ids or internal RX node_ids contiguous_partition2 = contiguous_partition.flip(test_flips) assert contiguous(contiguous_partition2) assert single_flip_contiguous(contiguous_partition2) @@ -79,6 +80,7 @@ def test_discontiguous_with_contiguous_flips_is_false( discontiguous_partition_with_flips ): part, test_flips = discontiguous_partition_with_flips + # frm: TODO: Figure out whether test_flips are in original node_ids or internal RX node_ids discontiguous_partition2 = part.flip(test_flips) assert not contiguous(discontiguous_partition2) @@ -91,6 +93,7 @@ def test_discontiguous_with_single_flip_contiguous_flips_is_false( discontiguous_partition_with_flips ): part, test_flips = discontiguous_partition_with_flips + # frm: TODO: Figure out whether test_flips are in original node_ids or internal RX node_ids discontiguous_partition2 = part.flip(test_flips) assert not single_flip_contiguous(discontiguous_partition2) @@ -99,6 +102,7 @@ def test_discontiguous_with_contiguous_bfs_flips_is_false( discontiguous_partition_with_flips ): part, test_flips = discontiguous_partition_with_flips + # frm: TODO: Figure out whether test_flips are in original node_ids or internal RX node_ids discontiguous_partition2 = part.flip(test_flips) assert not contiguous_bfs(discontiguous_partition2) diff --git a/tests/frm_tests/README.txt b/tests/frm_tests/README.txt new file mode 100644 index 00000000..037dbec1 --- /dev/null +++ b/tests/frm_tests/README.txt @@ -0,0 +1,6 @@ +This directory contains tests added by Fred Mueller +for the work he is doing / did to convert GerryChain +from using NetworkX to using RustworkX. 
+ +Eventually if his code becomes the new thing, these +tests should be rolled into the normal tests directory. diff --git a/tests/frm_tests/__init__.py b/tests/frm_tests/__init__.py new file mode 100644 index 00000000..b0fefc57 --- /dev/null +++ b/tests/frm_tests/__init__.py @@ -0,0 +1,2 @@ + +print("__init__.py invoked") diff --git a/tests/frm_tests/frm_regression_test.README.txt b/tests/frm_tests/frm_regression_test.README.txt new file mode 100644 index 00000000..a1051154 --- /dev/null +++ b/tests/frm_tests/frm_regression_test.README.txt @@ -0,0 +1,12 @@ +I created a regression test based on the User Guide code so that +I could make changes and quickly test whether they affected +user code. + +The 3 files that I added are: + + * frm_regression_test.py + * Code copied from the User Guide + * gerrymandria.json + * JSON for the graph used in the regression test + * frm_regression_test.README.txt + * This file diff --git a/tests/frm_tests/gerrymandria.json b/tests/frm_tests/gerrymandria.json new file mode 100644 index 00000000..a6ca2fae --- /dev/null +++ b/tests/frm_tests/gerrymandria.json @@ -0,0 +1,1641 @@ +{ + "directed": false, + "multigraph": false, + "graph": [], + "nodes": [ + { + "TOTPOP": 1, + "x": 0, + "y": 0, + "county": "1", + "district": "1", + "precinct": 0, + "muni": "1", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 0 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 1, + "county": "1", + "district": "1", + "precinct": 1, + "muni": "1", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 1 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 2, + "county": "1", + "district": "1", + "precinct": 2, + "muni": "5", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 2 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 3, + "county": "1", + "district": "1", + "precinct": 3, + "muni": "5", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 3 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 4, + "county": "3", + "district": "1", + "precinct": 4, + "muni": "9", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 4 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 5, + "county": "3", + "district": "1", + "precinct": 5, + "muni": "9", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 5 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 6, + "county": "3", + "district": "1", + "precinct": 6, + "muni": "13", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 6 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 7, + "county": "3", + "district": "1", + "precinct": 7, + "muni": "13", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 7 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 0, + "county": "1", + "district": "2", + "precinct": 8, + "muni": "1", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 8 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 1, + "county": "1", + "district": "2", + "precinct": 9, + "muni": "1", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 9 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 2, + "county": "1", + "district": "2", + "precinct": 10, + "muni": "5", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 10 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 3, + "county": "1", + "district": "2", + "precinct": 11, + "muni": "5", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 11 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 4, + "county": 
"3", + "district": "2", + "precinct": 12, + "muni": "9", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 12 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 5, + "county": "3", + "district": "2", + "precinct": 13, + "muni": "9", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 13 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 6, + "county": "3", + "district": "2", + "precinct": 14, + "muni": "13", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 14 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 7, + "county": "3", + "district": "2", + "precinct": 15, + "muni": "13", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "4", + "id": 15 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 0, + "county": "1", + "district": "3", + "precinct": 16, + "muni": "2", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "4", + "id": 16 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 1, + "county": "1", + "district": "3", + "precinct": 17, + "muni": "2", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 17 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 2, + "county": "1", + "district": "3", + "precinct": 18, + "muni": "6", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 18 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 3, + "county": "1", + "district": "3", + "precinct": 19, + "muni": "6", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 19 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 4, + "county": "3", + "district": "3", + "precinct": 20, + "muni": "10", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 20 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 5, + "county": "3", + "district": "3", + "precinct": 21, + "muni": "10", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 21 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 6, + "county": "3", + "district": "3", + "precinct": 22, + "muni": "14", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 22 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 7, + "county": "3", + "district": "3", + "precinct": 23, + "muni": "14", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "4", + "id": 23 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 0, + "county": "1", + "district": "4", + "precinct": 24, + "muni": "2", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "4", + "id": 24 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 1, + "county": "1", + "district": "4", + "precinct": 25, + "muni": "2", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 25 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 2, + "county": "1", + "district": "4", + "precinct": 26, + "muni": "6", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 26 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 3, + "county": "1", + "district": "4", + "precinct": 27, + "muni": "6", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 27 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 4, + "county": "3", + "district": "4", + "precinct": 28, + "muni": "10", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 28 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 5, + "county": "3", + "district": "4", + "precinct": 29, + "muni": "10", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 29 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 6, + "county": "3", + "district": "4", + "precinct": 30, + "muni": "14", 
+ "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 30 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 7, + "county": "3", + "district": "4", + "precinct": 31, + "muni": "14", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 31 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 0, + "county": "2", + "district": "5", + "precinct": 32, + "muni": "3", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 32 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 1, + "county": "2", + "district": "5", + "precinct": 33, + "muni": "3", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 33 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 2, + "county": "2", + "district": "5", + "precinct": 34, + "muni": "7", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 34 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 3, + "county": "2", + "district": "5", + "precinct": 35, + "muni": "7", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 35 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 4, + "county": "4", + "district": "5", + "precinct": 36, + "muni": "11", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 36 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 5, + "county": "4", + "district": "5", + "precinct": 37, + "muni": "11", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 37 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 6, + "county": "4", + "district": "5", + "precinct": 38, + "muni": "15", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 38 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 7, + "county": "4", + "district": "5", + "precinct": 39, + "muni": "15", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 39 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 0, + "county": "2", + "district": "6", + "precinct": 40, + "muni": "3", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 40 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 1, + "county": "2", + "district": "6", + "precinct": 41, + "muni": "3", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 41 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 2, + "county": "2", + "district": "6", + "precinct": 42, + "muni": "7", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 42 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 3, + "county": "2", + "district": "6", + "precinct": 43, + "muni": "7", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 43 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 4, + "county": "4", + "district": "6", + "precinct": 44, + "muni": "11", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 44 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 5, + "county": "4", + "district": "6", + "precinct": 45, + "muni": "11", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 45 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 6, + "county": "4", + "district": "6", + "precinct": 46, + "muni": "15", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 46 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 7, + "county": "4", + "district": "6", + "precinct": 47, + "muni": "15", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 47 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 0, + "county": "2", + "district": "7", + "precinct": 48, + "muni": "4", + "boundary_node": true, + "boundary_perim": 1, + 
"water_dist": "3", + "id": 48 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 1, + "county": "2", + "district": "7", + "precinct": 49, + "muni": "4", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 49 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 2, + "county": "2", + "district": "7", + "precinct": 50, + "muni": "8", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 50 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 3, + "county": "2", + "district": "7", + "precinct": 51, + "muni": "8", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 51 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 4, + "county": "4", + "district": "7", + "precinct": 52, + "muni": "12", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 52 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 5, + "county": "4", + "district": "7", + "precinct": 53, + "muni": "12", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 53 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 6, + "county": "4", + "district": "7", + "precinct": 54, + "muni": "16", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 54 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 7, + "county": "4", + "district": "7", + "precinct": 55, + "muni": "16", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 55 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 0, + "county": "2", + "district": "8", + "precinct": 56, + "muni": "4", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 56 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 1, + "county": "2", + "district": "8", + "precinct": 57, + "muni": "4", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 57 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 2, + "county": "2", + "district": "8", + "precinct": 58, + "muni": "8", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 58 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 3, + "county": "2", + "district": "8", + "precinct": 59, + "muni": "8", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 59 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 4, + "county": "4", + "district": "8", + "precinct": 60, + "muni": "12", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 60 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 5, + "county": "4", + "district": "8", + "precinct": 61, + "muni": "12", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 61 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 6, + "county": "4", + "district": "8", + "precinct": 62, + "muni": "16", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 62 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 7, + "county": "4", + "district": "8", + "precinct": 63, + "muni": "16", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 63 + } + ], + "adjacency": [ + [ + { + "id": 8 + }, + { + "id": 1 + } + ], + [ + { + "id": 0 + }, + { + "id": 9 + }, + { + "id": 2 + } + ], + [ + { + "id": 1 + }, + { + "id": 10 + }, + { + "id": 3 + } + ], + [ + { + "id": 2 + }, + { + "id": 11 + }, + { + "id": 4 + } + ], + [ + { + "id": 3 + }, + { + "id": 12 + }, + { + "id": 5 + } + ], + [ + { + "id": 4 + }, + { + "id": 13 + }, + { + "id": 6 + } + ], + [ + { + "id": 5 + }, + { + "id": 14 + }, + { + "id": 7 + } + ], + [ + { + "id": 6 + }, + { + "id": 15 + } + ], + [ + { + "id": 0 + }, + { + "id": 16 + }, + { + "id": 9 + } + ], + [ + { + "id": 1 + }, + { + "id": 
8 + }, + { + "id": 17 + }, + { + "id": 10 + } + ], + [ + { + "id": 2 + }, + { + "id": 9 + }, + { + "id": 18 + }, + { + "id": 11 + } + ], + [ + { + "id": 3 + }, + { + "id": 10 + }, + { + "id": 19 + }, + { + "id": 12 + } + ], + [ + { + "id": 4 + }, + { + "id": 11 + }, + { + "id": 20 + }, + { + "id": 13 + } + ], + [ + { + "id": 5 + }, + { + "id": 12 + }, + { + "id": 21 + }, + { + "id": 14 + } + ], + [ + { + "id": 6 + }, + { + "id": 13 + }, + { + "id": 22 + }, + { + "id": 15 + } + ], + [ + { + "id": 7 + }, + { + "id": 14 + }, + { + "id": 23 + } + ], + [ + { + "id": 8 + }, + { + "id": 24 + }, + { + "id": 17 + } + ], + [ + { + "id": 9 + }, + { + "id": 16 + }, + { + "id": 25 + }, + { + "id": 18 + } + ], + [ + { + "id": 10 + }, + { + "id": 17 + }, + { + "id": 26 + }, + { + "id": 19 + } + ], + [ + { + "id": 11 + }, + { + "id": 18 + }, + { + "id": 27 + }, + { + "id": 20 + } + ], + [ + { + "id": 12 + }, + { + "id": 19 + }, + { + "id": 28 + }, + { + "id": 21 + } + ], + [ + { + "id": 13 + }, + { + "id": 20 + }, + { + "id": 29 + }, + { + "id": 22 + } + ], + [ + { + "id": 14 + }, + { + "id": 21 + }, + { + "id": 30 + }, + { + "id": 23 + } + ], + [ + { + "id": 15 + }, + { + "id": 22 + }, + { + "id": 31 + } + ], + [ + { + "id": 16 + }, + { + "id": 32 + }, + { + "id": 25 + } + ], + [ + { + "id": 17 + }, + { + "id": 24 + }, + { + "id": 33 + }, + { + "id": 26 + } + ], + [ + { + "id": 18 + }, + { + "id": 25 + }, + { + "id": 34 + }, + { + "id": 27 + } + ], + [ + { + "id": 19 + }, + { + "id": 26 + }, + { + "id": 35 + }, + { + "id": 28 + } + ], + [ + { + "id": 20 + }, + { + "id": 27 + }, + { + "id": 36 + }, + { + "id": 29 + } + ], + [ + { + "id": 21 + }, + { + "id": 28 + }, + { + "id": 37 + }, + { + "id": 30 + } + ], + [ + { + "id": 22 + }, + { + "id": 29 + }, + { + "id": 38 + }, + { + "id": 31 + } + ], + [ + { + "id": 23 + }, + { + "id": 30 + }, + { + "id": 39 + } + ], + [ + { + "id": 24 + }, + { + "id": 40 + }, + { + "id": 33 + } + ], + [ + { + "id": 25 + }, + { + "id": 32 + }, + { + "id": 41 + }, + { + "id": 34 + } + ], + [ + { + "id": 26 + }, + { + "id": 33 + }, + { + "id": 42 + }, + { + "id": 35 + } + ], + [ + { + "id": 27 + }, + { + "id": 34 + }, + { + "id": 43 + }, + { + "id": 36 + } + ], + [ + { + "id": 28 + }, + { + "id": 35 + }, + { + "id": 44 + }, + { + "id": 37 + } + ], + [ + { + "id": 29 + }, + { + "id": 36 + }, + { + "id": 45 + }, + { + "id": 38 + } + ], + [ + { + "id": 30 + }, + { + "id": 37 + }, + { + "id": 46 + }, + { + "id": 39 + } + ], + [ + { + "id": 31 + }, + { + "id": 38 + }, + { + "id": 47 + } + ], + [ + { + "id": 32 + }, + { + "id": 48 + }, + { + "id": 41 + } + ], + [ + { + "id": 33 + }, + { + "id": 40 + }, + { + "id": 49 + }, + { + "id": 42 + } + ], + [ + { + "id": 34 + }, + { + "id": 41 + }, + { + "id": 50 + }, + { + "id": 43 + } + ], + [ + { + "id": 35 + }, + { + "id": 42 + }, + { + "id": 51 + }, + { + "id": 44 + } + ], + [ + { + "id": 36 + }, + { + "id": 43 + }, + { + "id": 52 + }, + { + "id": 45 + } + ], + [ + { + "id": 37 + }, + { + "id": 44 + }, + { + "id": 53 + }, + { + "id": 46 + } + ], + [ + { + "id": 38 + }, + { + "id": 45 + }, + { + "id": 54 + }, + { + "id": 47 + } + ], + [ + { + "id": 39 + }, + { + "id": 46 + }, + { + "id": 55 + } + ], + [ + { + "id": 40 + }, + { + "id": 56 + }, + { + "id": 49 + } + ], + [ + { + "id": 41 + }, + { + "id": 48 + }, + { + "id": 57 + }, + { + "id": 50 + } + ], + [ + { + "id": 42 + }, + { + "id": 49 + }, + { + "id": 58 + }, + { + "id": 51 + } + ], + [ + { + "id": 43 + }, + { + "id": 50 + }, + { + "id": 59 + }, + { + "id": 52 + } + ], + [ + { + "id": 
44 + }, + { + "id": 51 + }, + { + "id": 60 + }, + { + "id": 53 + } + ], + [ + { + "id": 45 + }, + { + "id": 52 + }, + { + "id": 61 + }, + { + "id": 54 + } + ], + [ + { + "id": 46 + }, + { + "id": 53 + }, + { + "id": 62 + }, + { + "id": 55 + } + ], + [ + { + "id": 47 + }, + { + "id": 54 + }, + { + "id": 63 + } + ], + [ + { + "id": 48 + }, + { + "id": 57 + } + ], + [ + { + "id": 49 + }, + { + "id": 56 + }, + { + "id": 58 + } + ], + [ + { + "id": 50 + }, + { + "id": 57 + }, + { + "id": 59 + } + ], + [ + { + "id": 51 + }, + { + "id": 58 + }, + { + "id": 60 + } + ], + [ + { + "id": 52 + }, + { + "id": 59 + }, + { + "id": 61 + } + ], + [ + { + "id": 53 + }, + { + "id": 60 + }, + { + "id": 62 + } + ], + [ + { + "id": 54 + }, + { + "id": 61 + }, + { + "id": 63 + } + ], + [ + { + "id": 55 + }, + { + "id": 62 + } + ] + ] +} \ No newline at end of file diff --git a/tests/frm_tests/nx_rx_play.py b/tests/frm_tests/nx_rx_play.py new file mode 100644 index 00000000..283b99ea --- /dev/null +++ b/tests/frm_tests/nx_rx_play.py @@ -0,0 +1,126 @@ +####################################################### +# frm: Overview of nx_rx_play.py +# +# This file exists to explore how NX and RX differ. +# +# It will probably evolve into a way to test whether stuff +# works the same in the new Graph object with NX vs. RX +# under the covers... +# +# + +# frm TODO: Convert this into a pytest format... + +import matplotlib.pyplot as plt +from gerrychain import (Partition, Graph, MarkovChain, + updaters, constraints, accept) +from gerrychain.proposals import recom +from gerrychain.constraints import contiguous +from functools import partial +import pandas + +import os +import rustworkx as rx +import networkx as nx + +import pytest + + +# Set the random seed so that the results are reproducible! +import random +random.seed(2024) + +# Create NX and RX Graph objects with their underlying NX and RX graphs + +# Get path to the JSON containing graph data +test_file_path = os.path.abspath(__file__) +cur_directory = os.path.dirname(test_file_path) +path_for_json_file = os.path.join(cur_directory, "gerrymandria.json") +# print("json file is: ", path_for_json_file) + +# Create an NX based Graph object from the JSON +gerrychain_nx_graph = Graph.from_json(path_for_json_file) + +# Fetch the NX graph object from inside the Graph object +nx_graph = gerrychain_nx_graph.get_nx_graph() + +# Create an RX graph object from NX and set node type to be a dictionary to preserve data attributes +rx_graph = rx.networkx_converter(nx_graph, keep_attributes=True) + +# Create a Graph object with an RX graph inside +gerrychain_rx_graph = Graph.from_rustworkx(rx_graph) + +# frm: ???: TODO: The set(rx_graph.nodes()) fails because it returns dictionaries, which are not hashable... +# nx_set_of_nodes = set(nx_graph.nodes()) +# print("len nx_set_of_nodes is: ", len(nx_set_of_nodes)) +# rx_set_of_nodes = set(rx_graph.nodes()) +# print("len rx_set_of_nodes is: ", len(rx_set_of_nodes)) +# print("NX nodes: ", nx_set_of_nodes) +# print("RX nodes: ", rx_set_of_nodes) + +print("Testing node data dict") +print("NX data dict for node 1: ", gerrychain_nx_graph.node_data(1)) +print("RX data dict for node 1: ", gerrychain_rx_graph.node_data(1)) + +""" +Stuff to figure out / test: + * graph data - that is, data on the graph itself.
+ * NX + graph = nx.Graph(day="Friday") + graph['day'] = "Monday" + * RX + graph = rx.PyGraph(attrs=dict(day="Friday")) + graph.attrs['day'] = "Monday" + * graph.nodes + * NX + This is a NodeView: + nodes[x] gives the data dict for the node + * RX + RX does not have this. Instead it has nodes() which just returns + a list/set of the node indices. + Actually, a node in RX can be any Python object, and in particular, it + can be a dictionary. In my test case, I think the node in the graph is just + an integer, but it should instead be a dict. Can the value of one node differ + from that of another node? That is, can you have a graph where the nodes are + of different types? This would semantically make no sense, but maybe it is + possible. + What is nice about this is that graph[node_id] in RX will be the data for the + node - in our case a dictionary. So the syntax to access a node's data dictionary + will be different, but both work: + + NX: graph.nodes[node_id] + RX: graph[node_id] + + * Comments: + The code had graph.nodes[node_id][value_id] = new_value all over. I changed + the code to instead do graph.node_data(node_id)[value_id] = new_value, but I think + that users are used to doing it the old way => need to create a NodeView for RX... + * graph.edges() + * NX + * RX + * graph.edges[edge]["random_weight"] = weight + * NX + * RX + * graph.node_indices + * NX + * RX + * graph.neighbors(node) + * NX + * RX + * graph.add_edge(node, node) + => next_node(node) local function needs to return int node ID + * NX + * RX + * graph.degree(node) + * NX + * RX + * iter(graph) + * NX + * RX + * graph._degrees[node] => what is _degrees? + * NX + * RX + * graph.edges + * NX + * RX +""" diff --git a/tests/frm_tests/test_frm_make_graph.py b/tests/frm_tests/test_frm_make_graph.py new file mode 100644 index 00000000..c4f23bd9 --- /dev/null +++ b/tests/frm_tests/test_frm_make_graph.py @@ -0,0 +1,298 @@ +################################################################ +# +# frm: This file was copied from test_make_graph.py (to make +# use of its fixtures).
It should eventually evolve into +# a reasonable test of additional functions added by me +# to gerrychain.graph +# +################################################################ + +import pathlib +from tempfile import TemporaryDirectory +from unittest.mock import patch + +import geopandas as gp +import pandas +import pytest +from shapely.geometry import Polygon +from pyproj import CRS + +from gerrychain.graph import Graph +from gerrychain.graph.geo import GeometryError + +import networkx + + +@pytest.fixture +def geodataframe(): + a = Polygon([(0, 0), (0, 1), (1, 1), (1, 0)]) + b = Polygon([(0, 1), (0, 2), (1, 2), (1, 1)]) + c = Polygon([(1, 0), (1, 1), (2, 1), (2, 0)]) + d = Polygon([(1, 1), (1, 2), (2, 2), (2, 1)]) + df = gp.GeoDataFrame({"ID": ["a", "b", "c", "d"], "geometry": [a, b, c, d]}) + df.crs = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" + return df + + +@pytest.fixture +def gdf_with_data(geodataframe): + geodataframe["data"] = list(range(len(geodataframe))) + geodataframe["data2"] = list(range(len(geodataframe))) + return geodataframe + + +@pytest.fixture +def geodataframe_with_boundary(): + """ + abe + ade + ace + """ + a = Polygon([(0, 0), (0, 1), (0, 2), (0, 3), (1, 3), (1, 2), (1, 1), (1, 0)]) + b = Polygon([(1, 2), (1, 3), (2, 3), (2, 2)]) + c = Polygon([(1, 0), (1, 1), (2, 1), (2, 0)]) + d = Polygon([(1, 1), (1, 2), (2, 2), (2, 1)]) + e = Polygon([(2, 0), (2, 1), (2, 2), (2, 3), (3, 3), (3, 2), (3, 1), (3, 0)]) + df = gp.GeoDataFrame({"ID": ["a", "b", "c", "d", "e"], "geometry": [a, b, c, d, e]}) + df.crs = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" + return df + + +@pytest.fixture +def shapefile(gdf_with_data): + with TemporaryDirectory() as d: + filepath = pathlib.Path(d) / "temp.shp" + filename = str(filepath.absolute()) + gdf_with_data.to_file(filename) + yield filename + + +@pytest.fixture +def target_file(): + with TemporaryDirectory() as d: + filepath = pathlib.Path(d) / "temp.shp" + filename = str(filepath.absolute()) + yield filename + + +def test_add_data_to_graph_can_handle_column_names_that_start_with_numbers(): + nx_graph = networkx.Graph([("01", "02"), ("02", "03"), ("03", "01")]) + df = pandas.DataFrame({"16SenDVote": [20, 30, 50], "node": ["01", "02", "03"]}) + df = df.set_index("node") + + graph = Graph.from_networkx(nx_graph) + graph.add_data(df, ["16SenDVote"]) + + assert nx_graph.nodes["01"]["16SenDVote"] == 20 + assert nx_graph.nodes["02"]["16SenDVote"] == 30 + assert nx_graph.nodes["03"]["16SenDVote"] == 50 + + assert graph.node_data("01")["16SenDVote"] == 20 + assert graph.node_data("02")["16SenDVote"] == 30 + assert graph.node_data("03")["16SenDVote"] == 50 + + +def test_join_can_handle_right_index(): + nx_graph = networkx.Graph([("01", "02"), ("02", "03"), ("03", "01")]) + df = pandas.DataFrame({"16SenDVote": [20, 30, 50], "node": ["01", "02", "03"]}) + + graph = Graph.from_networkx(nx_graph) + + graph.join(df, ["16SenDVote"], right_index="node") + + assert graph.node_data("01")["16SenDVote"] == 20 + assert graph.node_data("02")["16SenDVote"] == 30 + assert graph.node_data("03")["16SenDVote"] == 50 + + +def test_make_graph_from_dataframe_creates_graph(geodataframe): + graph = Graph.from_geodataframe(geodataframe) + assert isinstance(graph, Graph) + + +def test_make_graph_from_dataframe_preserves_df_index(geodataframe): + df = geodataframe.set_index("ID") + graph = Graph.from_geodataframe(df) + assert set(graph.nodes) == {"a", "b", "c", "d"} + + +def test_make_graph_from_dataframe_gives_correct_graph(geodataframe): + df = 
geodataframe.set_index("ID") + graph = Graph.from_geodataframe(df) + + assert edge_set_equal( + set(graph.edges), {("a", "b"), ("a", "c"), ("b", "d"), ("c", "d")} + ) + + +def test_make_graph_works_with_queen_adjacency(geodataframe): + df = geodataframe.set_index("ID") + graph = Graph.from_geodataframe(df, adjacency="queen") + + assert edge_set_equal( + set(graph.edges), + {("a", "b"), ("a", "c"), ("b", "d"), ("c", "d"), ("a", "d"), ("b", "c")}, + ) + + +def test_can_pass_queen_or_rook_strings_to_control_adjacency(geodataframe): + df = geodataframe.set_index("ID") + graph = Graph.from_geodataframe(df, adjacency="queen") + + assert edge_set_equal( + set(graph.edges), + {("a", "b"), ("a", "c"), ("b", "d"), ("c", "d"), ("a", "d"), ("b", "c")}, + ) + + +def test_can_insist_on_not_reprojecting(geodataframe): + df = geodataframe.set_index("ID") + graph = Graph.from_geodataframe(df, reproject=False) + + for node in ("a", "b", "c", "d"): + assert graph.node_data(node)["area"] == 1 + + for edge in graph.edges: + assert graph.edge_data(edge)["shared_perim"] == 1 + + +def test_does_not_reproject_by_default(geodataframe): + df = geodataframe.set_index("ID") + graph = Graph.from_geodataframe(df) + + for node in ("a", "b", "c", "d"): + assert graph.node_data(node)["area"] == 1.0 + + for edge in graph.edges: + assert graph.edge_data(edge)["shared_perim"] == 1.0 + + +def test_reproject(geodataframe): + # I don't know what the areas and perimeters are in UTM for these made-up polygons, + # but I'm pretty sure they're not 1. + df = geodataframe.set_index("ID") + graph = Graph.from_geodataframe(df, reproject=True) + + for node in ("a", "b", "c", "d"): + assert graph.node_data(node)["area"] != 1 + + for edge in graph.edges: + assert graph.edge_data(edge)["shared_perim"] != 1 + + +def test_identifies_boundary_nodes(geodataframe_with_boundary): + df = geodataframe_with_boundary.set_index("ID") + graph = Graph.from_geodataframe(df) + + for node in ("a", "b", "c", "e"): + assert graph.node_data(node)["boundary_node"] + assert not graph.node_data("d")["boundary_node"] + + +def test_computes_boundary_perims(geodataframe_with_boundary): + df = geodataframe_with_boundary.set_index("ID") + graph = Graph.from_geodataframe(df, reproject=False) + + expected = {"a": 5, "e": 5, "b": 1, "c": 1} + + for node, value in expected.items(): + assert graph.node_data(node)["boundary_perim"] == value + + +def edge_set_equal(set1, set2): + return {(y, x) for x, y in set1} | set1 == {(y, x) for x, y in set2} | set2 + + +def test_from_file_adds_all_data_by_default(shapefile): + graph = Graph.from_file(shapefile) + + # frm: Original Code: + # Get all of the data dictionaries for each node and verify that each + # of them contains data with the key "data" and "data2" + # + # assert all("data" in node_data for node_data in graph.nodes.values()) + # assert all("data2" in node_data for node_data in graph.nodes.values()) + + # data dictionaries for all of the nodes + all_node_data = [graph.node_data(node_id) for node_id in graph.node_indices] + + assert all("data" in node_data for node_data in all_node_data) + assert all("data2" in node_data for node_data in all_node_data) + + +def test_from_file_and_then_to_json_does_not_error(shapefile, target_file): + graph = Graph.from_file(shapefile) + + # Even the geometry column is copied to the graph + + # data dictionaries for all of the nodes + all_node_data = [graph.node_data(node_id) for node_id in graph.node_indices] + + assert all("geometry" in node_data for node_data in all_node_data) + + 
graph.to_json(target_file) + + +def test_from_file_and_then_to_json_with_geometries(shapefile, target_file): + graph = Graph.from_file(shapefile) + + # data dictionaries for all of the nodes + all_node_data = [graph.node_data(node_id) for node_id in graph.node_indices] + + # Even the geometry column is copied to the graph + assert all("geometry" in node_data for node_data in all_node_data) + + # frm: ??? Does anything check that the file is actually written? + graph.to_json(target_file, include_geometries_as_geojson=True) + + +def test_graph_warns_for_islands(): + nx_graph = networkx.Graph() + nx_graph.add_node(0) + graph = Graph.from_networkx(nx_graph) + + with pytest.warns(Warning): + graph.warn_for_islands() + + +def test_graph_raises_if_crs_is_missing_when_reprojecting(geodataframe): + geodataframe.crs = None + + with pytest.raises(ValueError): + Graph.from_geodataframe(geodataframe, reproject=True) + + +def test_raises_geometry_error_if_invalid_geometry(shapefile): + with patch("gerrychain.graph.geo.explain_validity") as explain: + explain.return_value = "Invalid geometry" + with pytest.raises(GeometryError): + Graph.from_file(shapefile, ignore_errors=False) + + +def test_can_ignore_errors_while_making_graph(shapefile): + with patch("gerrychain.graph.geo.explain_validity") as explain: + explain.return_value = "Invalid geometry" + assert Graph.from_file(shapefile, ignore_errors=True) + + +def test_data_and_geometry(gdf_with_data): + df = gdf_with_data + graph = Graph.from_geodataframe(df, cols_to_add=["data","data2"]) + assert graph.geometry is df.geometry + #graph.add_data(df[["data"]]) + assert (graph.data["data"] == df["data"]).all() + #graph.add_data(df[["data2"]]) + assert list(graph.data.columns) == ["data", "data2"] + + +def test_make_graph_from_dataframe_has_crs(gdf_with_data): + graph = Graph.from_geodataframe(gdf_with_data) + assert CRS.from_json(graph.graph["crs"]).equals(gdf_with_data.crs) + +def test_make_graph_from_shapefile_has_crs(shapefile): + graph = Graph.from_file(shapefile) + df = gp.read_file(shapefile) + assert CRS.from_json(graph.graph["crs"]).equals(df.crs) + + + diff --git a/tests/frm_tests/test_frm_nx_rx_graph.py b/tests/frm_tests/test_frm_nx_rx_graph.py new file mode 100644 index 00000000..3cdec14e --- /dev/null +++ b/tests/frm_tests/test_frm_nx_rx_graph.py @@ -0,0 +1,232 @@ +####################################################### +# Overview of test_frm_nx_rx_graph.py +####################################################### +""" + +A collection of tests to verify that the new GerryChain +Graph object works the same with NetworkX and RustworkX. + + +""" + +import matplotlib.pyplot as plt +from gerrychain import (Partition, Graph, MarkovChain, + updaters, constraints, accept) +from gerrychain.proposals import recom +from gerrychain.constraints import contiguous +from functools import partial +import pandas + +import os +import rustworkx as rx +import networkx as nx + +import pytest + + +# Set the random seed so that the results are reproducible! +import random +random.seed(2024) + +############################################################ +# Create Graph Objects - both direct NX.Graph and RX.PyGraph +# objects and two GerryChain Graph objects that embed the +# NX and RX graphs. 
+############################################################ + +@pytest.fixture(scope="module") +def json_file_path(): + # Get path to the JSON containing graph data + test_file_path = os.path.abspath(__file__) + cur_directory = os.path.dirname(test_file_path) + path_for_json_file = os.path.join(cur_directory, "gerrymandria.json") + # print("json file is: ", path_for_json_file) + return path_for_json_file + +@pytest.fixture(scope="module") +def gerrychain_nx_graph(json_file_path): + # Create an NX based Graph object from the JSON + graph = Graph.from_json(json_file_path) + print("gerrychain_nx_graph: len(graph): ", len(graph)) + return graph + +@pytest.fixture(scope="module") +def nx_graph(gerrychain_nx_graph): + # Fetch the NX graph object from inside the Graph object + return gerrychain_nx_graph.get_nx_graph() + +@pytest.fixture(scope="module") +def rx_graph(nx_graph): + # Create an RX graph object from NX, preserving node data + return rx.networkx_converter(nx_graph, keep_attributes=True) + +@pytest.fixture(scope="module") +def gerrychain_rx_graph(rx_graph): + # Create a Graph object with an RX graph inside + return Graph.from_rustworkx(rx_graph) + +################## +# Start of Tests +################## + +def test_sanity(): + # frm: if you call pytest with -rP, then it will show stdout for tests + print("test_sanity(): called") + assert True + +def test_nx_rx_sets_of_nodes_agree(nx_graph, rx_graph): + nx_set_of_nodes = set(nx_graph.nodes()) + rx_set_of_nodes = set(rx_graph.node_indices()) + assert nx_set_of_nodes == rx_set_of_nodes + +def test_nx_rx_node_data_agree(gerrychain_nx_graph, gerrychain_rx_graph): + nx_data_dict = gerrychain_nx_graph.node_data(1) + rx_data_dict = gerrychain_rx_graph.node_data(1) + assert nx_data_dict == rx_data_dict + +def test_nx_rx_node_indices_agree(gerrychain_nx_graph, gerrychain_rx_graph): + nx_node_indices = gerrychain_nx_graph.node_indices + rx_node_indices = gerrychain_rx_graph.node_indices + assert nx_node_indices == rx_node_indices + +def test_nx_rx_edges_agree(gerrychain_nx_graph, gerrychain_rx_graph): + # TODO: Rethink this test. At the moment it relies on the edge_list() + # call, which does not exist on the GerryChain Graph object and is + # instead passed through to RX by the clever __getattr__ stuff. + # I think we should add an edge_list() method to GerryChain Graph + nx_edges = set(gerrychain_nx_graph.edges) + rx_edges = set(gerrychain_rx_graph.edge_list()) + assert nx_edges == rx_edges + +def test_nx_rx_node_neighbors_agree(gerrychain_nx_graph, gerrychain_rx_graph): + for i in gerrychain_nx_graph: + # Need to convert to set, because ordering of neighbor nodes differs in the lists + nx_neighbors = set(gerrychain_nx_graph.neighbors(i)) + rx_neighbors = set(gerrychain_rx_graph.neighbors(i)) + assert nx_neighbors == rx_neighbors + +def test_nx_rx_subgraphs_agree(gerrychain_nx_graph, gerrychain_rx_graph): + subgraph_nodes = [0,1,2,3,4,5] # TODO: make this a fixture dependent on JSON graph + nx_subgraph = gerrychain_nx_graph.subgraph(subgraph_nodes) + rx_subgraph = gerrychain_rx_graph.subgraph(subgraph_nodes) + for node_id in nx_subgraph: + nx_node_data = nx_subgraph.node_data(node_id) + rx_node_data = rx_subgraph.node_data(node_id) + assert nx_node_data == rx_node_data + # frm: TODO: This does not test that the rx_subgraph has the exact same number of + # nodes as the nx_subgraph, and it does not test edge data - see the sketch below...
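+
+# frm: A minimal sketch addressing the TODO above. It assumes (not settled
+#      API) that subgraphs support the same edges / edge_list() accessors
+#      exercised in test_nx_rx_edges_agree above.
+def test_nx_rx_subgraph_sizes_agree(gerrychain_nx_graph, gerrychain_rx_graph):
+    subgraph_nodes = [0,1,2,3,4,5]
+    nx_subgraph = gerrychain_nx_graph.subgraph(subgraph_nodes)
+    rx_subgraph = gerrychain_rx_graph.subgraph(subgraph_nodes)
+    # Same number of nodes in both subgraphs...
+    assert len(nx_subgraph) == len(rx_subgraph)
+    # ...and the same edges, compared as unordered node-id pairs
+    nx_edges = {tuple(sorted(edge)) for edge in nx_subgraph.edges}
+    rx_edges = {tuple(sorted(edge)) for edge in rx_subgraph.edge_list()}
+    assert nx_edges == rx_edges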
+ +def test_nx_rx_degrees_agree(gerrychain_nx_graph, gerrychain_rx_graph): + # Verify that the degree of each node agrees between NX and RX versions + nx_degrees = { + node_id: gerrychain_nx_graph.degree(node_id) for node_id in gerrychain_nx_graph.node_indices + } + rx_degrees = { + node_id: gerrychain_rx_graph.degree(node_id) for node_id in gerrychain_rx_graph.node_indices + } + for node_id in gerrychain_nx_graph.node_indices: + assert nx_degrees[node_id] == rx_degrees[node_id] + + +""" +frm: TODO: + + * Functions: + * predecessors() + * successors() + * is_connected() + * laplacian_matrix() + * normalized_laplacian_matrix() + * neighbors() + I think this has been done for both NX and RX + * networkx.generators.lattice.grid_2d_graph() + * nx.to_dict_of_lists() + * nx.tree.minimum_spanning_tree() + * nx.number_connected_components() + * nx.set_edge_attributes() + * nx.set_node_attributes() + + * Syntax: + * graph.edges + NX - note that edges and edges() do exactly the same thing. They return + an EdgeView of a list of edges with edge_id being a tuple indicating + the start and end node_ids for the edge. + Need to find out how edges and edges() is used in the code to know + what the right thing to do is for RX - that is, what aspect of an + EdgeView is used in the code? Is a set of tuples OK? + * graph.nodes + NX returns a NodeView with the node_ids for the nodes + RX does not have a "nodes" attribute, but it does have a nodes() + method which does something different. It returns a list (indexed + by node_id) of the data associated with nodes. + So, I need to see how Graph.nodes is used in the code to see what the + right way is to support it in RX. + * graph.nodes[node_id] + returns data dictionary for the node + * graph.nodes[node_id][attr_id] + returns the value for the given attribute for that node's data + * graph.add_edge() + Done differently in NX and RX + * graph.degree + * graph.subgraph + * for edge in graph.edge_indices: + graph.edges[edge]["weight"] = random.random() + In RX, assigning the weight to an edge is done differently... + Note that edge_indices currently works exactly the same for both + NX and RX - returning a set of tuples (for edges). However, + assigning a value to the "weight" attribute of an edge is done + differently... + * islands() +""" + + + + + + +### my_updaters = { +### "population": updaters.Tally("TOTPOP"), +### "cut_edges": updaters.cut_edges +### } +### +### initial_partition = Partition( +### nx_graph, +### assignment="district", +### updaters=my_updaters +### ) +### +### # This should be 8 since each district has 1 person in it. +### # Note that the key "population" corresponds to the population updater +### # that we defined above and not with the population column in the json file. 
+### ideal_population = sum(initial_partition["population"].values()) / len(initial_partition) +### +### proposal = partial( +### recom, +### pop_col="TOTPOP", +### pop_target=ideal_population, +### epsilon=0.01, +### node_repeats=2 +### ) +### +### print("Got proposal") +### +### recom_chain = MarkovChain( +### proposal=proposal, +### constraints=[contiguous], +### accept=accept.always_accept, +### initial_state=initial_partition, +### total_steps=40 +### ) +### +### print("Set up Markov Chain") +### +### assignment_list = [] +### +### for i, item in enumerate(recom_chain): +### print(f"Finished step {i+1}/{len(recom_chain)}") +### assignment_list.append(item.assignment) +### +### print("Enumerated the chain: number of entries in list is: ", len(assignment_list)) +### +### def test_success(): +### len(assignment_list) == 40 diff --git a/tests/frm_tests/test_frm_old_vs_new_graph.py b/tests/frm_tests/test_frm_old_vs_new_graph.py new file mode 100644 index 00000000..8563980f --- /dev/null +++ b/tests/frm_tests/test_frm_old_vs_new_graph.py @@ -0,0 +1,127 @@ +# +# This tests compatibility between the old/original version of +# the Graph object and the new version that encapsulates the +# graph as a data member - either nx_graph or rx_graph. +# + +import matplotlib.pyplot as plt +from gerrychain import (Partition, Graph, MarkovChain, + updaters, constraints, accept) +from gerrychain.graph import OriginalGraph +from gerrychain.proposals import recom +from gerrychain.constraints import contiguous +from functools import partial +import pandas + +import os +import rustworkx as rx +import networkx as nx + + +# Set the random seed so that the results are reproducible! +import random +random.seed(2024) + + +test_file_path = os.path.abspath(__file__) +cur_directory = os.path.dirname(test_file_path) +json_file_path = os.path.join(cur_directory, "gerrymandria.json") +print("json file is: ", json_file_path) + +new_graph = Graph.from_json(json_file_path) +old_graph = OriginalGraph.from_json(json_file_path) + + +print("Created old and new Graph objects from JSON") + +# frm: DEBUGGING: +# print("created new_graph") +# print("type of new_graph.nodes is: ", type(new_graph.nodes)) +new_graph_nodes = new_graph.nodes +old_graph_nodes = list(old_graph.nodes) +# print("new_graph nodes: ", list(new_graph.nodes)) +# print("new_graph edges: ", list(new_graph.edges)) +# print("") # newline +# print("created old_graph") +# print("type of old_graph.nodes is: ", type(old_graph.nodes)) +# print("old_graph nodes: ", list(old_graph.nodes)) +# print("old_graph edges: ", list(old_graph.edges)) + +print("testing that graph.nodes have same length") +assert(len(new_graph.nodes) == len(old_graph.nodes)), "lengths disagree" + +new_graph_edges = new_graph.edges +old_graph_edges = set(old_graph.edges) +print("testing that graph.edges have same length") +assert(len(new_graph_edges) == len(old_graph_edges)), "lengths disagree" + +node_subset = set([1,2,3,4,5]) +new_graph_subset = new_graph.subgraph(node_subset) +print("type of new_graph.subset is: ", type(new_graph_subset)) +print(new_graph_subset.edges) +old_graph_subset = old_graph.subgraph(node_subset) +print("type of old_graph.subset is: ", type(old_graph_subset)) +print(old_graph_subset.edges) + +# print("created frm_graph") +# print("FrmGraph nodes: ", list(frm_graph.nodes)) +# print("FrmGraph edges: ", list(frm_graph.edges)) + +print("About to test Graph.predecessors(root)") +pred = new_graph.predecessors(1) +print(list(pred)) + +# frm: TODO: Flesh out this test... 
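+
+# frm: A first small step toward fleshing it out (a sketch; it assumes both
+#      subgraphs use the same node_ids, which should hold here since both
+#      Graph objects were loaded from the same JSON with NX underneath):
+def normalize_edges(edges):
+    # Compare edges as unordered pairs so that (a, b) and (b, a) match
+    return {tuple(sorted(edge)) for edge in edges}
+
+print("testing that old and new subgraphs have the same edges")
+assert normalize_edges(new_graph_subset.edges) == normalize_edges(old_graph_subset.edges), "subgraph edges disagree"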
+ + +# +# The code below is from the regression test - maybe +# it will be useful in the future, maybe not... +# + +### my_updaters = { +### "population": updaters.Tally("TOTPOP"), +### "cut_edges": updaters.cut_edges +### } +### +### initial_partition = Partition( +### new_graph, +### assignment="district", +### updaters=my_updaters +### ) +### +### # This should be 8 since each district has 1 person in it. +### # Note that the key "population" corresponds to the population updater +### # that we defined above and not with the population column in the json file. +### ideal_population = sum(initial_partition["population"].values()) / len(initial_partition) +### +### proposal = partial( +### recom, +### pop_col="TOTPOP", +### pop_target=ideal_population, +### epsilon=0.01, +### node_repeats=2 +### ) +### +### print("Got proposal") +### +### recom_chain = MarkovChain( +### proposal=proposal, +### constraints=[contiguous], +### accept=accept.always_accept, +### initial_state=initial_partition, +### total_steps=40 +### ) +### +### print("Set up Markov Chain") +### +### assignment_list = [] +### +### for i, item in enumerate(recom_chain): +### print(f"Finished step {i+1}/{len(recom_chain)}") +### assignment_list.append(item.assignment) +### +### print("Enumerated the chain: number of entries in list is: ", len(assignment_list)) +### +### def test_success(): +### len(assignment_list) == 40 diff --git a/tests/frm_tests/test_frm_regression.py b/tests/frm_tests/test_frm_regression.py new file mode 100644 index 00000000..cac1134d --- /dev/null +++ b/tests/frm_tests/test_frm_regression.py @@ -0,0 +1,89 @@ +############################################################### +# +# frm: Overview of test_frm_regression.py +# +# This code was copied from the GerryChain User Guide / Tutorial as a way +# to have a functional test that exercised the overall logic of GerryChain. +# +# It is NOT comprehensive, but it does get all the way to executing +# a chain. +# +# It is a quick and dirty way to make sure I haven't really screwed things up ;-) +# + +import matplotlib.pyplot as plt +from gerrychain import (Partition, Graph, MarkovChain, + updaters, constraints, accept) +from gerrychain.proposals import recom +from gerrychain.constraints import contiguous +from functools import partial +import pandas + +import os + + +# Set the random seed so that the results are reproducible! +import random +random.seed(2024) + + +test_file_path = os.path.abspath(__file__) +cur_directory = os.path.dirname(test_file_path) +json_file_path = os.path.join(cur_directory, "gerrymandria.json") +print("json file is: ", json_file_path) + +graph = Graph.from_json(json_file_path) + +print("Created Graph from JSON") + +# frm: DEBUGGING: +# print("created graph") +# print("nodes: ", list(graph.nodes)) +# print("edges: ", list(graph.edges)) + +my_updaters = { + "population": updaters.Tally("TOTPOP"), + "cut_edges": updaters.cut_edges +} + +initial_partition = Partition( + graph, + assignment="district", + updaters=my_updaters +) + +# This should be 8 since each district has 1 person in it. +# Note that the key "population" corresponds to the population updater +# that we defined above and not with the population column in the json file. 
+ideal_population = sum(initial_partition["population"].values()) / len(initial_partition) + +proposal = partial( + recom, + pop_col="TOTPOP", + pop_target=ideal_population, + epsilon=0.01, + node_repeats=2 +) + +print("Got proposal") + +recom_chain = MarkovChain( + proposal=proposal, + constraints=[contiguous], + accept=accept.always_accept, + initial_state=initial_partition, + total_steps=40 +) + +print("Set up Markov Chain") + +assignment_list = [] + +for i, item in enumerate(recom_chain): + print(f"Finished step {i+1}/{len(recom_chain)}") + assignment_list.append(item.assignment) + +print("Enumerated the chain: number of entries in list is: ", len(assignment_list)) + +def test_success(): + assert len(assignment_list) == 40 diff --git a/tests/frm_tests/test_frm_sanity.py b/tests/frm_tests/test_frm_sanity.py new file mode 100644 index 00000000..aa3f72f3 --- /dev/null +++ b/tests/frm_tests/test_frm_sanity.py @@ -0,0 +1,3 @@ + +def test_doit(): + assert 2 + 2 == 4 diff --git a/tests/optimization/test_single_metric.py b/tests/optimization/test_single_metric.py index e9868492..c6cdc968 100644 --- a/tests/optimization/test_single_metric.py +++ b/tests/optimization/test_single_metric.py @@ -501,6 +501,7 @@ def opt_fn(partition): ): max_scores_sb[i] = optimizer.best_score + + # frm: TODO: stmt below fails with 1.0 != 2 assert np.max(max_scores_sb) == 2 @@ -550,6 +551,7 @@ def opt_fn(partition): ): max_scores_anneal[i] = optimizer.best_score + + # frm: TODO: stmt below fails. assert np.max(max_scores_anneal) == 2 diff --git a/tests/partition/test_partition.py b/tests/partition/test_partition.py index d9e42dbc..b35a95f9 100644 --- a/tests/partition/test_partition.py +++ b/tests/partition/test_partition.py @@ -12,6 +12,13 @@ def test_Partition_can_be_flipped(example_partition): + # frm: TODO: Verify that this flip is in internal RX-based graph node_ids and not "original" NX node_ids + # + # My guess is that this flip is intended to be in original node_ids but that the test works + # anyway because the assertion uses the same numbers. It should probably be changed to use + # original node_ids and to translate the node_id and part in the assert into internal node_ids + # just to make it crystal clear to anyone following later what is going on... + + flip = {1: 2} new_partition = example_partition.flip(flip) assert new_partition.assignment[1] == 2 @@ -45,6 +52,9 @@ def test_Partition_knows_cut_edges_K3(example_partition): def test_propose_random_flip_proposes_a_partition(example_partition): partition = example_partition + + # frm: TODO: Check propose_random_flip() to make sure it is doing the right thing + # wrt RX-based node_ids vs. original node_ids.
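+    # frm: One cheap sanity check while that is verified (a sketch; it assumes
+    #      proposal.flips maps node -> part in the partition graph's own
+    #      node_ids, as it does in the metagraph tests):
+    #
+    #   for node in proposal.flips:
+    #       assert node in partition.graph.node_indices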
proposal = propose_random_flip(partition) + assert isinstance(proposal, partition.__class__) @@ -54,10 +64,10 @@ def example_geographic_partition(): graph = Graph.from_networkx(networkx.complete_graph(3)) assignment = {0: 1, 1: 1, 2: 2} for node in graph.nodes: - graph.nodes[node]["boundary_node"] = False - graph.nodes[node]["area"] = 1 + graph.node_data(node)["boundary_node"] = False + graph.node_data(node)["area"] = 1 for edge in graph.edges: - graph.edges[edge]["shared_perim"] = 1 + graph.edge_data(edge)["shared_perim"] = 1 return GeographicPartition(graph, assignment, None, None, None) @@ -69,15 +79,32 @@ def test_geographic_partition_can_be_instantiated(example_geographic_partition): def test_Partition_parts_is_a_dictionary_of_parts_to_nodes(example_partition): partition = example_partition flip = {1: 2} - new_partition = partition.flip(flip) + new_partition = partition.flip(flip, use_original_node_ids=True) assert all(isinstance(nodes, frozenset) for nodes in new_partition.parts.values()) assert all(isinstance(nodes, frozenset) for nodes in partition.parts.values()) def test_Partition_has_subgraphs(example_partition): + # Test that subgraphs work as intended. + # The partition has two parts (districts) with IDs: 1, 2 + # Part #1 has nodes 0, 1, so the subgraph for part #1 should have these nodes + # Part #2 has node 2, so the subgraph for part #2 should have this node + + # Note that the original node_ids are based on the original NX-based graph + # The node_ids in the partition's graph have been changed by the conversion + # from NX to RX, so we need to be careful about when to use "original" node_ids + # and when to use "internal" RX-based node_ids + partition = example_partition + + subgraph_for_part_1 = partition.subgraphs[1] + internal_node_id_0 = subgraph_for_part_1.internal_node_id_for_original_node_id(0) + internal_node_id_1 = subgraph_for_part_1.internal_node_id_for_original_node_id(1) + assert set(partition.subgraphs[1].nodes) == {internal_node_id_0, internal_node_id_1} + + subgraph_for_part_2 = partition.subgraphs[2] + internal_node_id = subgraph_for_part_2.internal_node_id_for_original_node_id(2) + assert set(partition.subgraphs[2].nodes) == {internal_node_id} assert len(list(partition.subgraphs)) == 2 @@ -92,10 +119,20 @@ def test_partition_implements_getattr_for_updater_access(example_partition): def test_can_be_created_from_a_districtr_file(graph, districtr_plan_file): for node in graph: - graph.nodes[node]["area_num_1"] = node + graph.node_data(node)["area_num_1"] = node + + # frm: TODO: NX vs. RX node_id issues here...
partition = Partition.from_districtr_file(graph, districtr_plan_file) - assert partition.assignment.to_dict() == { + + # Convert internal node_ids of the partition's graph to "original" node_ids + internal_node_assignment = partition.assignment.to_dict() + original_node_assignment = {} + for internal_node_id, part in internal_node_assignment.items(): + original_node_id = partition.graph.original_node_id_for_internal_node_id(internal_node_id) + original_node_assignment[original_node_id] = part + + assert original_node_assignment == { 0: 1, 1: 1, 2: 1, diff --git a/tests/partition/test_plotting.py b/tests/partition/test_plotting.py index b9916319..20374d94 100644 --- a/tests/partition/test_plotting.py +++ b/tests/partition/test_plotting.py @@ -3,13 +3,15 @@ import geopandas as gp import pytest from shapely.geometry import Polygon +import networkx from gerrychain import Graph, Partition @pytest.fixture def partition(): - graph = Graph([(0, 1), (1, 3), (2, 3), (0, 2)]) + nx_graph = networkx.Graph([(0, 1), (1, 3), (2, 3), (0, 2)]) + graph = Graph.from_networkx(nx_graph) return Partition(graph, {0: 1, 1: 1, 2: 2, 3: 2}) @@ -66,5 +68,31 @@ def test_uses_graph_geometries_by_default(self, geodataframe): graph = Graph.from_geodataframe(geodataframe) partition = Partition(graph=graph, assignment={node: 0 for node in graph}) + + # frm: TODO: the following statement blows up because we do not copy + # geometry data from NX to RX when we convert to RX. + # Need to grok what the right way to deal with geometry + # data is (is it only an issue for from_geodataframe() or + # are there other ways a geometry value might be set?) + # + # Peter comments (from PR): + # + # The geometry data should only exist on the attached geodataframe. + # In fact, if there is no "geometry" column in the dataframe, this call + # should fail. + # + # Fixing the plotting functions is a low priority. I need to set up + # snapshot tests for these anyway, so if you find working with + # matplotlib a PITA (because it is), then don't worry about the + # plotting functions for now. + # + # Worst-case scenario, I can just add some temporary verbiage to + # readthedocs telling people to use + # + # my_partition.df.plot() + + # Which will just use all of the plotting stuff that Pandas has set up internally. + partition.plot() assert mock_plot.call_count == 1 + \ No newline at end of file diff --git a/tests/test_chain.py b/tests/test_chain.py index 3910df2f..8f4902b3 100644 --- a/tests/test_chain.py +++ b/tests/test_chain.py @@ -4,12 +4,12 @@ class MockState: - def flip(self, changes): + def flip(self, changes, use_original_node_ids): return MockState() def mock_proposal(state): - return state.flip({1: 2}) + return state.flip({1: 2}, use_original_node_ids=True) def mock_accept(state): diff --git a/tests/test_laplacian.py b/tests/test_laplacian.py new file mode 100644 index 00000000..25ab2572 --- /dev/null +++ b/tests/test_laplacian.py @@ -0,0 +1,95 @@ + +import pytest +import networkx as nx +import rustworkx as rx +import numpy as np +from gerrychain.graph import Graph +import gerrychain.tree as gctree + +""" +This tests whether we compute the same laplacian matrix for NX and RX +based Graph objects. + +The NX version is computed (as was true in the old code) by a built-in +NetworkX routine. The RX version is computed by code added when we +supported RX as the embedded graph object. + +The NX version produces ints from the code below, while the RX +version produces floats.
I don't think this matters as the laplacian +matrix is used to do numerical calculations, so that code should +happily use ints or floats, but it means that for this test I need +to convert the NX version's result to have floating point values. +""" + +# frm: TODO: Add additional tests for laplacian matrix calculations, in +# particular, add a test for normalized_laplacian_matrix() +# once that routine has been implemented. + + +def are_sparse_matrices_equal(sparse_matrix1, sparse_matrix2, rtol=1e-05, atol=1e-08): + """ + Checks if two scipy.sparse.csr_matrix objects are equal, considering + potential floating-point inaccuracies in the data. + + Args: + sparse_matrix1 (scipy.sparse.csr_matrix): The first sparse matrix. + sparse_matrix2 (scipy.sparse.csr_matrix): The second sparse matrix. + rtol (float): The relative tolerance parameter for np.allclose. + atol (float): The absolute tolerance parameter for np.allclose. + + Returns: + bool: True if the sparse matrices are equal, False otherwise. + """ + # Check if shapes are equal + if sparse_matrix1.shape != sparse_matrix2.shape: + return False + + # Check if the number of non-zero elements is equal + if sparse_matrix1.nnz != sparse_matrix2.nnz: + return False + + # Check for equality of structural components (indices and indptr) + # These should be exact matches + if not (np.array_equal(sparse_matrix1.indices, sparse_matrix2.indices) and + np.array_equal(sparse_matrix1.indptr, sparse_matrix2.indptr)): + return False + + # Check for approximate equality of data (values) + # Use np.allclose to handle floating-point comparisons + if not np.allclose(sparse_matrix1.data, sparse_matrix2.data, rtol=rtol, atol=atol): + return False + + return True + +# Create equivalent NX and RX graphs from scratch + +@pytest.fixture +def nx_graph(): + this_nx_graph = nx.Graph([(0, 1), (0, 2), (1, 2), (2, 3)]) + return this_nx_graph + +@pytest.fixture +def rx_graph(): + this_rx_graph = rx.PyGraph() + this_rx_graph.add_nodes_from([0, 1, 2, 3]) + this_rx_graph.add_edges_from([(0, 1, "data"), (0, 2, "data"), (1, 2, "data"), (2, 3, "data")]) + return this_rx_graph + + +def test_nx_rx_laplacian_matrix_equality(nx_graph, rx_graph): + + # Create Graph objects from the NX and RX graphs + gc_nx_graph = Graph.from_networkx(nx_graph) + gc_rx_graph = Graph.from_rustworkx(rx_graph) + + # Compute the laplacian_matrix for both the NX and RX based Graph objects + gc_nx_laplacian_matrix = gc_nx_graph.laplacian_matrix() + gc_rx_laplacian_matrix = gc_rx_graph.laplacian_matrix() + + # Convert values in the NX version to be floating point + float_gc_nx_laplacian_matrix = gc_nx_laplacian_matrix.astype(float) + + # test equality + matrices_are_equal = are_sparse_matrices_equal(float_gc_nx_laplacian_matrix, gc_rx_laplacian_matrix) + assert(matrices_are_equal) + diff --git a/tests/test_make_graph.py b/tests/test_make_graph.py index e7c1a5c8..1fa1d96c 100644 --- a/tests/test_make_graph.py +++ b/tests/test_make_graph.py @@ -8,9 +8,14 @@ from shapely.geometry import Polygon from pyproj import CRS +import networkx + from gerrychain.graph import Graph from gerrychain.graph.geo import GeometryError +# frm: added following import +# from gerrychain.graph import node_data + @pytest.fixture def geodataframe(): @@ -65,27 +70,43 @@ def target_file(): def test_add_data_to_graph_can_handle_column_names_that_start_with_numbers(): - graph = Graph([("01", "02"), ("02", "03"), ("03", "01")]) + + # frm: Test has been modified to work with new Graph object that has an NetworkX.Graph + # object embedded 
inside it. I am not sure if this test actually tests + # anything useful anymore... + + nx_graph = networkx.Graph([("01", "02"), ("02", "03"), ("03", "01")]) df = pandas.DataFrame({"16SenDVote": [20, 30, 50], "node": ["01", "02", "03"]}) df = df.set_index("node") + # frm: Note that add_data() on the new Graph writes through to the + # embedded nx_graph - both are checked below + + graph = Graph.from_networkx(nx_graph) + graph.add_data(df, ["16SenDVote"]) - assert graph.nodes["01"]["16SenDVote"] == 20 - assert graph.nodes["02"]["16SenDVote"] == 30 - assert graph.nodes["03"]["16SenDVote"] == 50 + # Test that the embedded nx_graph object has the added data + assert nx_graph.nodes["01"]["16SenDVote"] == 20 + assert nx_graph.nodes["02"]["16SenDVote"] == 30 + assert nx_graph.nodes["03"]["16SenDVote"] == 50 + + # Test that the graph object has the added data + assert graph.node_data("01")["16SenDVote"] == 20 + assert graph.node_data("02")["16SenDVote"] == 30 + assert graph.node_data("03")["16SenDVote"] == 50 def test_join_can_handle_right_index(): - graph = Graph([("01", "02"), ("02", "03"), ("03", "01")]) + nx_graph = networkx.Graph([("01", "02"), ("02", "03"), ("03", "01")]) df = pandas.DataFrame({"16SenDVote": [20, 30, 50], "node": ["01", "02", "03"]}) - graph.join(df, ["16SenDVote"], right_index="node") + graph = Graph.from_networkx(nx_graph) - assert graph.nodes["01"]["16SenDVote"] == 20 - assert graph.nodes["02"]["16SenDVote"] == 30 - assert graph.nodes["03"]["16SenDVote"] == 50 + graph.join(df, ["16SenDVote"], right_index="node") + assert graph.node_data("01")["16SenDVote"] == 20 + assert graph.node_data("02")["16SenDVote"] == 30 + assert graph.node_data("03")["16SenDVote"] == 50 def test_make_graph_from_dataframe_creates_graph(geodataframe): graph = Graph.from_geodataframe(geodataframe) @@ -132,10 +153,10 @@ def test_can_insist_on_not_reprojecting(geodataframe): graph = Graph.from_geodataframe(df, reproject=False) for node in ("a", "b", "c", "d"): - assert graph.nodes[node]["area"] == 1 + assert graph.node_data(node)["area"] == 1 for edge in graph.edges: - assert graph.edges[edge]["shared_perim"] == 1 + assert graph.edge_data(edge)["shared_perim"] == 1 def test_does_not_reproject_by_default(geodataframe): @@ -143,10 +164,10 @@ graph = Graph.from_geodataframe(df) for node in ("a", "b", "c", "d"): - assert graph.nodes[node]["area"] == 1.0 + assert graph.node_data(node)["area"] == 1.0 for edge in graph.edges: - assert graph.edges[edge]["shared_perim"] == 1.0 + assert graph.edge_data(edge)["shared_perim"] == 1.0 def test_reproject(geodataframe): @@ -156,10 +177,10 @@ graph = Graph.from_geodataframe(df, reproject=True) for node in ("a", "b", "c", "d"): - assert graph.nodes[node]["area"] != 1 + assert graph.node_data(node)["area"] != 1 for edge in graph.edges: - assert graph.edges[edge]["shared_perim"] != 1 + assert graph.edge_data(edge)["shared_perim"] != 1 def test_identifies_boundary_nodes(geodataframe_with_boundary): @@ -167,8 +188,8 @@ graph = Graph.from_geodataframe(df) for node in ("a", "b", "c", "e"): - assert graph.nodes[node]["boundary_node"] - assert not graph.nodes["d"]["boundary_node"] + assert graph.node_data(node)["boundary_node"] + assert not graph.node_data("d")["boundary_node"] def test_computes_boundary_perims(geodataframe_with_boundary): @@ -178,7 +199,7 @@ graph = Graph.from_geodataframe(df, reproject=False) expected = {"a": 5, "e": 5, "b": 1, "c": 1} for node, value
in expected.items(): - assert graph.nodes[node]["boundary_perim"] == value + assert graph.node_data(node)["boundary_perim"] == value def edge_set_equal(set1, set2): @@ -188,31 +209,39 @@ def edge_set_equal(set1, set2): def test_from_file_adds_all_data_by_default(shapefile): graph = Graph.from_file(shapefile) - assert all("data" in node_data for node_data in graph.nodes.values()) - assert all("data2" in node_data for node_data in graph.nodes.values()) + nx_graph = graph.get_nx_graph() + + assert all("data" in node_data for node_data in nx_graph.nodes.values()) + assert all("data2" in node_data for node_data in nx_graph.nodes.values()) def test_from_file_and_then_to_json_does_not_error(shapefile, target_file): graph = Graph.from_file(shapefile) + nx_graph = graph.get_nx_graph() + # Even the geometry column is copied to the graph - assert all("geometry" in node_data for node_data in graph.nodes.values()) + assert all("geometry" in node_data for node_data in nx_graph.nodes.values()) graph.to_json(target_file) def test_from_file_and_then_to_json_with_geometries(shapefile, target_file): graph = Graph.from_file(shapefile) + + nx_graph = graph.get_nx_graph() # Even the geometry column is copied to the graph - assert all("geometry" in node_data for node_data in graph.nodes.values()) + assert all("geometry" in node_data for node_data in nx_graph.nodes.values()) graph.to_json(target_file, include_geometries_as_geojson=True) def test_graph_warns_for_islands(): - graph = Graph() - graph.add_node(0) + nx_graph = networkx.Graph() + nx_graph.add_node(0) + + graph = Graph.from_networkx(nx_graph) with pytest.warns(Warning): graph.warn_for_islands() @@ -255,4 +284,4 @@ def test_make_graph_from_dataframe_has_crs(gdf_with_data): def test_make_graph_from_shapefile_has_crs(shapefile): graph = Graph.from_file(shapefile) df = gp.read_file(shapefile) - assert CRS.from_json(graph.graph["crs"]).equals(df.crs) \ No newline at end of file + assert CRS.from_json(graph.graph["crs"]).equals(df.crs) diff --git a/tests/test_metagraph.py b/tests/test_metagraph.py index 03aa2d59..4b5a587a 100644 --- a/tests/test_metagraph.py +++ b/tests/test_metagraph.py @@ -12,12 +12,21 @@ def partition(graph): def test_all_cut_edge_flips(partition): + result = set( (node, part) for flip in all_cut_edge_flips(partition) for node, part in flip.items() ) - assert result == {(6, 1), (7, 1), (8, 1), (4, 2), (5, 2), (3, 2)} + + # Convert from internal node_ids to "original" node_ids + new_result = set() + for internal_node_id, part in result: + original_node_id = partition.graph.original_node_id_for_internal_node_id(internal_node_id) + new_result.add((original_node_id, part)) + + # frm: TODO: stmt below fails - the "result" has (2,2) instead of (3,2) + assert new_result == {(6, 1), (7, 1), (8, 1), (4, 2), (5, 2), (3, 2)} class TestAllValidStatesOneFlipAway: @@ -35,6 +44,7 @@ def test_accepts_list_of_constraints(self, partition): def test_all_valid_flips(partition): + # frm: TODO: NX vs. RX node_id issues... 
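+    # frm: (sketch) if the node_id 6 below is meant to be an "original"
+    #      node_id, the constraint would presumably have to translate it
+    #      first, e.g.:
+    #
+    #   six = partition.graph.internal_node_id_for_original_node_id(6)
+    #
+    #      and then compare node == six inside disallow_six_to_one().
+    #      (This assumes the helper exists on the partition's graph; so far
+    #      these tests only call it on subgraphs.)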
def disallow_six_to_one(partition): for node, part in partition.flips.items(): if node == 6 and part == 1: @@ -48,4 +58,12 @@ def disallow_six_to_one(partition): for flip in all_valid_flips(partition, constraints) for node, part in flip.items() ) - assert result == {(7, 1), (8, 1), (4, 2), (5, 2), (3, 2)} + + # Convert from internal node_ids to "original" node_ids + new_result = set() + for internal_node_id, part in result: + original_node_id = partition.graph.original_node_id_for_internal_node_id(internal_node_id) + new_result.add((original_node_id, part)) + + # frm: TODO: stmt below fails - the "result" has (2,2) instead of (3,2) + assert new_result == {(7, 1), (8, 1), (4, 2), (5, 2), (3, 2)} diff --git a/tests/test_region_aware.py b/tests/test_region_aware.py index bcbe1e0f..50b22ac2 100644 --- a/tests/test_region_aware.py +++ b/tests/test_region_aware.py @@ -161,9 +161,16 @@ def straddled_regions(partition, reg_attr, all_reg_names): """Returns the total number of districts that straddle two regions in the partition.""" split = {name: 0 for name in all_reg_names} + # frm: TODO: Grok what this tests - not clear to me at this time... + + # frm: Original Code: + # for node1, node2 in set(partition.graph.edges() - partition["cut_edges"]): + # split[partition.graph.nodes[node1][reg_attr]] += 1 + # split[partition.graph.nodes[node2][reg_attr]] += 1 + for node1, node2 in set(partition.graph.edges() - partition["cut_edges"]): - split[partition.graph.nodes[node1][reg_attr]] += 1 - split[partition.graph.nodes[node2][reg_attr]] += 1 + split[partition.graph.node_data(node1)[reg_attr]] += 1 + split[partition.graph.node_data(node2)[reg_attr]] += 1 return sum(1 for value in split.values() if value > 0) @@ -237,6 +244,10 @@ def test_region_aware_muni_warning(): with pytest.warns(UserWarning) as record: # Random seed 2 should succeed, but drawing the # tree is hard, so we should get a warning + # frm: TODO: stmt below fails - saying too many attempts: + # + # raise RuntimeError(f"Could not find a possible cut after {max_attempts} attempts.") + # RuntimeError: Could not find a possible cut after 10000 attempts. run_chain_dual( seed=2, steps=1000, diff --git a/tests/test_tally.py b/tests/test_tally.py index 220bc149..46000848 100644 --- a/tests/test_tally.py +++ b/tests/test_tally.py @@ -8,6 +8,7 @@ import random from gerrychain.updaters.tally import DataTally, Tally random.seed(2018) +import networkx def random_assignment(graph, num_districts): return {node: random.choice(range(num_districts)) for node in graph.nodes} @@ -27,17 +28,30 @@ def test_data_tally_works_as_an_updater(three_by_three_grid): def test_data_tally_gives_expected_value(three_by_three_grid): + # Put all but one of the nodes in part #1, and put the one "first_node" + # into part #2. + first_node = next(iter(three_by_three_grid.nodes)) assignment = {node: 1 for node in three_by_three_grid.nodes} assignment[first_node] = 2 + # All nodes get a value of 1 for the data to be tallied data = {node: 1 for node in three_by_three_grid} updaters = {"tally": DataTally(data, alias="tally")} partition = Partition(three_by_three_grid, assignment, updaters) + # Note that in general a flip using node_ids generated before creating + # a partition should be translated into "internal" RX-Graph based + # node_ids. In this case that is not needed: the "original" and the + # "internal" node_id for first_node refer to the same node, and nothing + # else is going on.
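+    # frm: (sketch) if the translation ever did matter here, the flip could
+    #      be applied with the flag already used elsewhere in these tests:
+    #
+    #   new_partition = partition.flip({first_node: 1}, use_original_node_ids=True)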
+ + # Create a new partition, adding the "first_node" to part #1 flip = {first_node: 1} new_partition = partition.flip(flip) + # The "tally" should increase by one because of the flipped node's data assert new_partition["tally"][1] == partition["tally"][1] + 1 @@ -49,7 +63,7 @@ def test_data_tally_mimics_old_tally_usage(graph_with_random_data_factory): assignment = {i: 1 if i in range(4) else 2 for i in range(9)} partition = Partition(graph, assignment, updaters) - expected_total_in_district_one = sum(graph.nodes[i]["total"] for i in range(4)) + expected_total_in_district_one = sum(graph.node_data(i)["total"] for i in range(4)) assert partition["total"][1] == expected_total_in_district_one @@ -68,7 +82,7 @@ def get_expected_tally(partition): expected = defaultdict(int) for node in partition.graph.nodes: part = partition.assignment[node] - expected[part] += partition.graph.nodes[node]["population"] + expected[part] += partition.graph.node_data(node)["population"] return expected for state in chain: @@ -77,9 +91,10 @@ def get_expected_tally(partition): def test_works_when_no_flips_occur(): - graph = Graph([(0, 1), (1, 2), (2, 3), (3, 0)]) + nx_graph = networkx.Graph([(0, 1), (1, 2), (2, 3), (3, 0)]) + graph = Graph.from_networkx(nx_graph) for node in graph: - graph.nodes[node]["pop"] = node + 1 + graph.node_data(node)["pop"] = node + 1 partition = Partition(graph, {0: 0, 1: 0, 2: 1, 3: 1}, {"pop": Tally("pop")}) chain = MarkovChain(lambda p: p.flip({}), [], always_accept, partition, 10) diff --git a/tests/test_tree.py b/tests/test_tree.py index 1805b8ca..d2458205 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -30,8 +30,8 @@ @pytest.fixture def graph_with_pop(three_by_three_grid): for node in three_by_three_grid: - three_by_three_grid.nodes[node]["pop"] = 1 - return Graph.from_networkx(three_by_three_grid) + three_by_three_grid.node_data(node)["pop"] = 1 + return three_by_three_grid @pytest.fixture @@ -54,18 +54,19 @@ def twelve_by_twelve_with_pop(): def test_bipartition_tree_returns_a_subset_of_nodes(graph_with_pop): - ideal_pop = sum(graph_with_pop.nodes[node]["pop"] for node in graph_with_pop) / 2 + ideal_pop = sum(graph_with_pop.node_data(node)["pop"] for node in graph_with_pop) / 2 result = bipartition_tree(graph_with_pop, "pop", ideal_pop, 0.25, 10) + # frm: TODO: Next stmt fails - the result is not a frozenset... 
assert isinstance(result, frozenset) assert all(node in graph_with_pop.nodes for node in result) def test_bipartition_tree_returns_within_epsilon_of_target_pop(graph_with_pop): - ideal_pop = sum(graph_with_pop.nodes[node]["pop"] for node in graph_with_pop) / 2 + ideal_pop = sum(graph_with_pop.node_data(node)["pop"] for node in graph_with_pop) / 2 epsilon = 0.25 result = bipartition_tree(graph_with_pop, "pop", ideal_pop, epsilon, 10) - part_pop = sum(graph_with_pop.nodes[node]["pop"] for node in result) + part_pop = sum(graph_with_pop.node_data(node)["pop"] for node in result) assert abs(part_pop - ideal_pop) / ideal_pop < epsilon @@ -75,7 +76,7 @@ def test_recursive_tree_part_returns_within_epsilon_of_target_pop( n_districts = 7 # 144/7 ≈ 20.5 nodes/subgraph (1 person/node) ideal_pop = ( sum( - twelve_by_twelve_with_pop.nodes[node]["pop"] + twelve_by_twelve_with_pop.node_data(node)["pop"] for node in twelve_by_twelve_with_pop ) ) / n_districts @@ -90,7 +91,13 @@ def test_recursive_tree_part_returns_within_epsilon_of_target_pop( partition = Partition( twelve_by_twelve_with_pop, result, updaters={"pop": Tally("pop")} ) - return all( + # frm: Original Code: + # + # return all( + # abs(part_pop - ideal_pop) / ideal_pop < epsilon + # for part_pop in partition["pop"].values() + # ) + assert all( abs(part_pop - ideal_pop) / ideal_pop < epsilon for part_pop in partition["pop"].values() ) @@ -102,7 +109,7 @@ def test_recursive_tree_part_returns_within_epsilon_of_target_pop_using_contract n_districts = 7 # 144/7 ≈ 20.5 nodes/subgraph (1 person/node) ideal_pop = ( sum( - twelve_by_twelve_with_pop.nodes[node]["pop"] + twelve_by_twelve_with_pop.node_data(node)["pop"] for node in twelve_by_twelve_with_pop ) ) / n_districts @@ -122,7 +129,14 @@ def test_recursive_tree_part_returns_within_epsilon_of_target_pop_using_contract partition = Partition( twelve_by_twelve_with_pop, result, updaters={"pop": Tally("pop")} ) - return all( + # frm: Original Code: + # + # return all( + # abs(part_pop - ideal_pop) / ideal_pop < epsilon + # for part_pop in partition["pop"].values() + # ) + # + assert all( abs(part_pop - ideal_pop) / ideal_pop < epsilon for part_pop in partition["pop"].values() ) @@ -134,7 +148,7 @@ def test_recursive_seed_part_returns_within_epsilon_of_target_pop( n_districts = 7 # 144/7 ≈ 20.5 nodes/subgraph (1 person/node) ideal_pop = ( sum( - twelve_by_twelve_with_pop.nodes[node]["pop"] + twelve_by_twelve_with_pop.node_data(node)["pop"] for node in twelve_by_twelve_with_pop ) ) / n_districts @@ -151,7 +165,13 @@ def test_recursive_seed_part_returns_within_epsilon_of_target_pop( partition = Partition( twelve_by_twelve_with_pop, result, updaters={"pop": Tally("pop")} ) - return all( + # frm: Original Code: + # + # return all( + # abs(part_pop - ideal_pop) / ideal_pop < epsilon + # for part_pop in partition["pop"].values() + # ) + assert all( abs(part_pop - ideal_pop) / ideal_pop < epsilon for part_pop in partition["pop"].values() ) @@ -163,7 +183,7 @@ def test_recursive_seed_part_returns_within_epsilon_of_target_pop_using_contract n_districts = 7 # 144/7 ≈ 20.5 nodes/subgraph (1 person/node) ideal_pop = ( sum( - twelve_by_twelve_with_pop.nodes[node]["pop"] + twelve_by_twelve_with_pop.node_data(node)["pop"] for node in twelve_by_twelve_with_pop ) ) / n_districts @@ -185,7 +205,13 @@ def test_recursive_seed_part_returns_within_epsilon_of_target_pop_using_contract partition = Partition( twelve_by_twelve_with_pop, result, updaters={"pop": Tally("pop")} ) - return all( + # frm: Original Code: + # + # return 
all( + # abs(part_pop - ideal_pop) / ideal_pop < epsilon + # for part_pop in partition["pop"].values() + # ) + assert all( abs(part_pop - ideal_pop) / ideal_pop < epsilon for part_pop in partition["pop"].values() ) @@ -210,7 +236,7 @@ def dummy_method(graph, pop_col, pop_target, epsilon, node_repeats, one_sided_cu n_districts = 7 # 144/7 ≈ 20.5 nodes/subgraph (1 person/node) ideal_pop = ( sum( - twelve_by_twelve_with_pop.nodes[node]["pop"] + twelve_by_twelve_with_pop.node_data(node)["pop"] for node in twelve_by_twelve_with_pop ) ) / n_districts @@ -238,7 +264,7 @@ def test_recursive_seed_part_with_n_unspecified_within_epsilon( n_districts = 6 # This should set n=3 ideal_pop = ( sum( - twelve_by_twelve_with_pop.nodes[node]["pop"] + twelve_by_twelve_with_pop.node_data(node)["pop"] for node in twelve_by_twelve_with_pop ) ) / n_districts @@ -254,7 +280,13 @@ def test_recursive_seed_part_with_n_unspecified_within_epsilon( partition = Partition( twelve_by_twelve_with_pop, result, updaters={"pop": Tally("pop")} ) - return all( + # frm: Original Code: + # + # return all( + # abs(part_pop - ideal_pop) / ideal_pop < epsilon + # for part_pop in partition["pop"].values() + # ) + assert all( abs(part_pop - ideal_pop) / ideal_pop < epsilon for part_pop in partition["pop"].values() ) @@ -267,16 +299,17 @@ def test_random_spanning_tree_returns_tree_with_pop_attribute(graph_with_pop): def test_uniform_spanning_tree_returns_tree_with_pop_attribute(graph_with_pop): tree = uniform_spanning_tree(graph_with_pop) + # frm: TODO: Get rid of networkx dependency assert networkx.is_tree(tree) def test_bipartition_tree_returns_a_tree(graph_with_pop): - ideal_pop = sum(graph_with_pop.nodes[node]["pop"] for node in graph_with_pop) / 2 + ideal_pop = sum(graph_with_pop.node_data(node)["pop"] for node in graph_with_pop) / 2 tree = Graph.from_networkx( networkx.Graph([(0, 1), (1, 2), (1, 4), (3, 4), (4, 5), (3, 6), (6, 7), (6, 8)]) ) for node in tree: - tree.nodes[node]["pop"] = graph_with_pop.nodes[node]["pop"] + tree.node_data(node)["pop"] = graph_with_pop.node_data(node)["pop"] result = bipartition_tree( graph_with_pop, "pop", ideal_pop, 0.25, 10, tree, lambda x: 4 @@ -290,7 +323,7 @@ def test_bipartition_tree_returns_a_tree(graph_with_pop): def test_recom_works_as_a_proposal(partition_with_pop): graph = partition_with_pop.graph - ideal_pop = sum(graph.nodes[node]["pop"] for node in graph) / 2 + ideal_pop = sum(graph.node_data(node)["pop"] for node in graph) / 2 proposal = functools.partial( recom, pop_col="pop", pop_target=ideal_pop, epsilon=0.25, node_repeats=5 ) @@ -305,12 +338,58 @@ def test_recom_works_as_a_proposal(partition_with_pop): def test_reversible_recom_works_as_a_proposal(partition_with_pop): random.seed(2018) graph = partition_with_pop.graph - ideal_pop = sum(graph.nodes[node]["pop"] for node in graph) / 2 + ideal_pop = sum(graph.node_data(node)["pop"] for node in graph) / 2 proposal = functools.partial( reversible_recom, pop_col="pop", pop_target=ideal_pop, epsilon=0.10, M=1 ) constraints = [within_percent_of_ideal_population(partition_with_pop, 0.25, "pop")] + # frm: ???: I am not sure how epsilon of 0.10 interacts with the constraint. + # + # The issue is that there are 9 nodes each with a population of 1, so the ideal population + # is 4.5. But no matter how you split the graph, you end up with an integer population, say, + # 4 or 5 - so you will never get within 0.10 of 4.5. + # + # I am not quite sure what is being tested here... 
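+ #
+ # A quick worked check of that claim (using the numbers above: 9 nodes with
+ # 1 person each, so the ideal population is 9 / 2 = 4.5): any two-way split
+ # has integer populations, and the best possible splits give
+ #
+ #     abs(4 - 4.5) / 4.5 == abs(5 - 4.5) / 4.5 ~= 0.111 > 0.10
+ #
+ # so no split of this graph can ever satisfy an epsilon of 0.10.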
+ #
+ # within_percent_of_ideal_population() returns a Bounds object which contains the lower and
+ # upper bounds for a given value - in this case 25% of the ideal population.
+ #
+ # The more I dig into this, the more I shake my head. The value of "epsilon" passed into
+ # reversible_recom() seems to only ever be used when creating a PopulatedGraph, which in turn
+ # only ever uses it when doing a specific balanced edge cut algorithm. That is, the value of
+ # epsilon is very rarely used, and yet it is passed in as one of the important parameters to
+ # reversible_recom(). It looks like the original coders thought that it would be a great thing
+ # to have in the PopulatedGraph object, but then they didn't actually use it. *sigh*
+ #
+ # Then this test defines a population constraint that accepts anything within 25% of ideal -
+ # which is at odds with the epsilon of 10% above, but since the value of epsilon (of 10%)
+ # is never used, whatever...
+ #
+
+ # frm: TODO: Grok this test - what is it trying to accomplish?
+ #
+ # The proposal uses reversible_recom() with the default value for the "repeat_until_valid"
+ # parameter, which is False. This means that each attempt to merge and then re-split two
+ # parts (districts) gets only one shot before it fails. In this case, that means it fails
+ # EVERY time - because the initial spanning tree that is returned is not balanced
+ # enough to satisfy the population constraint. If you let it keep trying, it succeeds after
+ # a couple of attempts (I think 10), but it never succeeds on the first try, and there is no
+ # randomness possible since we only have two parts (districts) that we can merge.
+ #
+ # So this test runs through 100 chain iterations doing NOTHING - returning the same partition
+ # each iteration, and in fact returning the same partition at the end that it started with.
+ #
+ # This raises all sorts of issues:
+ #
+ # * Makes no sense for this test
+ # * Questions why reversible_recom() does not detect an infinite loop
+ # * Questions why the user is not informed that the chain is ineffective
+ # * Raises the issue of documentation of the code - it took me quite a long time to
+ #   figure out WTF was going on...
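+ #
+ # frm: A cheap way to confirm the no-op behavior described above (a sketch only -
+ #      it assumes Assignment.to_dict() is still available in this refactor):
+ #
+ #     seen_assignments = set()
+ #     for state in chain:
+ #         seen_assignments.add(frozenset(state.assignment.to_dict().items()))
+ #     assert len(seen_assignments) == 1   # the chain never actually moved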
+ # + + chain = MarkovChain(proposal, constraints, lambda x: True, partition_with_pop, 100) for state in chain: @@ -395,16 +474,16 @@ def test_prime_bound(): def test_bipartition_tree_random_returns_a_subset_of_nodes(graph_with_pop): - ideal_pop = sum(graph_with_pop.nodes[node]["pop"] for node in graph_with_pop) / 2 + ideal_pop = sum(graph_with_pop.node_data(node)["pop"] for node in graph_with_pop) / 2 result = bipartition_tree_random(graph_with_pop, "pop", ideal_pop, 0.25, 10) assert isinstance(result, frozenset) assert all(node in graph_with_pop.nodes for node in result) def test_bipartition_tree_random_returns_within_epsilon_of_target_pop(graph_with_pop): - ideal_pop = sum(graph_with_pop.nodes[node]["pop"] for node in graph_with_pop) / 2 + ideal_pop = sum(graph_with_pop.node_data(node)["pop"] for node in graph_with_pop) / 2 epsilon = 0.25 result = bipartition_tree_random(graph_with_pop, "pop", ideal_pop, epsilon, 10) - part_pop = sum(graph_with_pop.nodes[node]["pop"] for node in result) + part_pop = sum(graph_with_pop.node_data(node)["pop"] for node in result) assert abs(part_pop - ideal_pop) / ideal_pop < epsilon diff --git a/tests/updaters/dbg.py b/tests/updaters/dbg.py new file mode 100644 index 00000000..41c19b30 --- /dev/null +++ b/tests/updaters/dbg.py @@ -0,0 +1,79 @@ +import math + +import networkx +import pytest + +from gerrychain import MarkovChain +from gerrychain.constraints import Validator, no_vanishing_districts +from gerrychain.graph import Graph +from gerrychain.partition import Partition +from gerrychain.proposals import propose_random_flip +import random +from gerrychain.updaters import (Election, Tally, boundary_nodes, cut_edges, + cut_edges_by_part, exterior_boundaries, + exterior_boundaries_as_a_set, + interior_boundaries, perimeter) +from gerrychain.updaters.election import ElectionResults +random.seed(2018) + + +def create_three_by_three_grid(): + """Returns a graph that looks like this: + 0 1 2 + 3 4 5 + 6 7 8 + """ + nx_graph = networkx.Graph() + nx_graph.add_edges_from( + [ + (0, 1), + (0, 3), + (1, 2), + (1, 4), + (2, 5), + (3, 4), + (3, 6), + (4, 5), + (4, 7), + (5, 8), + (6, 7), + (7, 8), + ] + ) + return Graph.from_networkx(nx_graph) + + + + + +def random_assignment(graph, num_districts): + assignment = {node: random.choice(range(num_districts)) for node in graph.nodes} + # Make sure that there are cut edges: + while len(set(assignment.values())) == 1: + assignment = {node: random.choice(range(num_districts)) for node in graph.nodes} + return assignment + + + +def test_vote_proportion_returns_nan_if_total_votes_is_zero(three_by_three_grid): + election = Election("Mock Election", ["D", "R"], alias="election") + graph = three_by_three_grid + + for node in graph.nodes: + for col in election.columns: + graph.node_data(node)[col] = 0 + + updaters = {"election": election} + assignment = random_assignment(graph, 3) + + partition = Partition(graph, assignment, updaters) + + assert all( + math.isnan(value) + for party_percents in partition["election"].percents_for_party.values() + for value in party_percents.values() + ) + + +three_by_three_grid = create_three_by_three_grid() +test_vote_proportion_returns_nan_if_total_votes_is_zero(three_by_three_grid) diff --git a/tests/updaters/test_cut_edges.py b/tests/updaters/test_cut_edges.py index e6582f41..757ec09b 100644 --- a/tests/updaters/test_cut_edges.py +++ b/tests/updaters/test_cut_edges.py @@ -27,6 +27,13 @@ def invalid_cut_edges(partition): ] return invalid +def translate_flips_to_internal_node_ids(partition, 
flips): + # Translate flips into the internal_node_ids for the partition + internal_flips = {} + for original_node_id, part in flips.items(): + internal_node_id = partition.graph.internal_node_id_for_original_node_id(original_node_id) + internal_flips[internal_node_id] = part + return internal_flips def test_cut_edges_doesnt_duplicate_edges_with_different_order_of_nodes( three_by_three_grid, @@ -39,10 +46,13 @@ def test_cut_edges_doesnt_duplicate_edges_with_different_order_of_nodes( # 222 222 flip = {4: 2, 2: 1, 5: 1} - new_partition = Partition(parent=partition, flips=flip) + internal_flips = translate_flips_to_internal_node_ids(partition, flip) + + new_partition = Partition(parent=partition, flips=internal_flips) result = new_partition["cut_edges"] + # Verify that the same edge is not in the result twice (just in different node_id order) for edge in result: assert (edge[1], edge[0]) not in result @@ -56,13 +66,16 @@ def test_cut_edges_can_handle_multiple_flips(three_by_three_grid): # 222 222 flip = {4: 2, 2: 1, 5: 1} - new_partition = Partition(parent=partition, flips=flip) + internal_flips = translate_flips_to_internal_node_ids(partition, flip) + + new_partition = Partition(parent=partition, flips=internal_flips) result = new_partition["cut_edges"] naive_cut_edges = { - tuple(sorted(edge)) for edge in graph.edges if new_partition.crosses_parts(edge) + tuple(sorted(edge)) for edge in partition.graph.edges if new_partition.crosses_parts(edge) } + assert result == naive_cut_edges @@ -78,7 +91,9 @@ def test_cut_edges_by_part_doesnt_duplicate_edges_with_opposite_order_of_nodes( # 222 222 flip = {4: 2, 2: 1, 5: 1} - new_partition = Partition(parent=partition, flips=flip) + internal_flips = translate_flips_to_internal_node_ids(partition, flip) + + new_partition = Partition(parent=partition, flips=internal_flips) result = new_partition["cut_edges_by_part"] @@ -97,11 +112,13 @@ def test_cut_edges_by_part_gives_same_total_edges_as_naive_method(three_by_three # 222 222 flip = {4: 2, 2: 1, 5: 1} - new_partition = Partition(parent=partition, flips=flip) + internal_flips = translate_flips_to_internal_node_ids(partition, flip) + + new_partition = Partition(parent=partition, flips=internal_flips) result = new_partition["cut_edges_by_part"] naive_cut_edges = { - tuple(sorted(edge)) for edge in graph.edges if new_partition.crosses_parts(edge) + tuple(sorted(edge)) for edge in partition.graph.edges if new_partition.crosses_parts(edge) } assert naive_cut_edges == { @@ -115,11 +132,15 @@ def test_implementation_of_cut_edges_matches_naive_method(three_by_three_grid): partition = Partition(graph, assignment, {"cut_edges": cut_edges}) flip = {4: 2} - new_partition = Partition(parent=partition, flips=flip) + + internal_flips = translate_flips_to_internal_node_ids(partition, flip) + + new_partition = Partition(parent=partition, flips=internal_flips) + result = cut_edges(new_partition) naive_cut_edges = { - edge for edge in graph.edges if new_partition.crosses_parts(edge) + edge for edge in partition.graph.edges if new_partition.crosses_parts(edge) } assert edge_set_equal(result, naive_cut_edges) diff --git a/tests/updaters/test_perimeters.py b/tests/updaters/test_perimeters.py index 05c1f156..ded769cb 100644 --- a/tests/updaters/test_perimeters.py +++ b/tests/updaters/test_perimeters.py @@ -8,8 +8,13 @@ def setup(): + + # Note that the node_ids for the NX graph for a grid are tuples with the (x,y) position of the node + grid = Grid((4, 4), with_diagonals=False) - flipped_grid = grid.flip({(2, 1): 3}) + + 
flipped_grid = grid.flip({(2, 1): 3}, use_original_node_ids=True)
+
 return grid, flipped_grid
@@ -34,25 +39,37 @@ def test_cut_edges_by_part_handles_flips_with_a_simple_grid():
 result = flipped_grid["cut_edges_by_part"]
- assert result[0] == {
+ # Translate internal edges so that they can be compared to the literals below
+ new_result = {}
+ for part, set_of_edges in result.items():
+     new_set_of_edges = set()
+     for edge in set_of_edges:
+         new_edge = (
+             flipped_grid.graph.original_node_id_for_internal_node_id(edge[0]),
+             flipped_grid.graph.original_node_id_for_internal_node_id(edge[1]),
+         )
+         new_set_of_edges.add(new_edge)
+     new_result[part] = new_set_of_edges
+
+ assert new_result[0] == {
 ((1, 0), (2, 0)),
 ((1, 1), (2, 1)),
 ((0, 1), (0, 2)),
 ((1, 1), (1, 2)),
 }
- assert result[1] == {
+ assert new_result[1] == {
 ((1, 0), (2, 0)),
 ((2, 0), (2, 1)),
 ((2, 1), (3, 1)),
 ((3, 1), (3, 2)),
 }
- assert result[2] == {
+ assert new_result[2] == {
 ((0, 1), (0, 2)),
 ((1, 1), (1, 2)),
 ((1, 2), (2, 2)),
 ((1, 3), (2, 3)),
 }
- assert result[3] == {
+ assert new_result[3] == {
 ((1, 1), (2, 1)),
 ((2, 0), (2, 1)),
 ((2, 1), (3, 1)),
@@ -99,9 +116,9 @@ def test_perimeter_match_naive_perimeter_at_every_step():
 def get_exterior_boundaries(partition):
 graph_boundary = partition["boundary_nodes"]
 exterior = defaultdict(lambda: 0)
- for node in graph_boundary:
- part = partition.assignment[node]
- exterior[part] += partition.graph.nodes[node]["boundary_perim"]
+ for node_id in graph_boundary:
+ part = partition.assignment[node_id]
+ exterior[part] += partition.graph.node_data(node_id)["boundary_perim"]
 return exterior
 def get_interior_boundaries(partition):
@@ -111,9 +128,9 @@ def get_interior_boundaries(partition):
 interior = defaultdict(int)
 for edge in cut_edges:
 for node in edge:
- interior[partition.assignment[node]] += partition.graph.edges[edge][
- "shared_perim"
- ]
+ interior[partition.assignment[node]] += partition.graph.edge_data(
+ partition.graph.get_edge_id_from_edge(edge)
+ )["shared_perim"]
 return interior
 def expected_perimeter(partition):
diff --git a/tests/updaters/test_split_scores.py b/tests/updaters/test_split_scores.py
index c26a32de..bb0b15e8 100644
--- a/tests/updaters/test_split_scores.py
+++ b/tests/updaters/test_split_scores.py
@@ -4,6 +4,15 @@ from gerrychain.updaters.locality_split_scores import LocalitySplits
 from gerrychain.updaters.cut_edges import cut_edges
 from gerrychain import Graph
+import networkx
+
+# frm: TODO: This test fails due to NX dependencies in locality_split_scores.py
+#
+# There are lots of comments in that file about what needs to be fixed, but
+# it is a low priority because the code in locality_split_scores.py is not used
+# in the gerrychain codebase - it is presumably used by other users of GC, so
+# this needs to be fixed sometime - but later...
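+#
+# frm: For reference, a minimal sketch of the node_id round-trip invariant that the
+#      translation code above relies on (the helper name here is purely illustrative):
+#
+#     def assert_node_id_round_trip(graph, original_node_id):
+#         internal_id = graph.internal_node_id_for_original_node_id(original_node_id)
+#         assert graph.original_node_id_for_internal_node_id(internal_id) == original_node_id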
+# @pytest.fixture def three_by_three_grid(): @@ -12,8 +21,8 @@ def three_by_three_grid(): 3 4 5 6 7 8 """ - graph = Graph() - graph.add_edges_from( + nx_graph = networkx.Graph() + nx_graph.add_edges_from( [ (0, 1), (0, 3), @@ -29,20 +38,21 @@ def three_by_three_grid(): (7, 8), ] ) + graph = Graph.from_networkx(nx_graph) return graph @pytest.fixture def graph_with_counties(three_by_three_grid): for node in [0, 1, 2]: - three_by_three_grid.nodes[node]["county"] = "a" - three_by_three_grid.nodes[node]["pop"] = 1 + three_by_three_grid.node_data(node)["county"] = "a" + three_by_three_grid.node_data(node)["pop"] = 1 for node in [3, 4, 5]: - three_by_three_grid.nodes[node]["county"] = "b" - three_by_three_grid.nodes[node]["pop"] = 1 + three_by_three_grid.node_data(node)["county"] = "b" + three_by_three_grid.node_data(node)["pop"] = 1 for node in [6, 7, 8]: - three_by_three_grid.nodes[node]["county"] = "c" - three_by_three_grid.nodes[node]["pop"] = 1 + three_by_three_grid.node_data(node)["county"] = "c" + three_by_three_grid.node_data(node)["pop"] = 1 return three_by_three_grid @@ -69,10 +79,7 @@ def split_partition(graph_with_counties): ) return partition - - - - +# frm: TODO: NX vs. RX node_id issues here. class TestSplittingScores: diff --git a/tests/updaters/test_splits.py b/tests/updaters/test_splits.py index 1b6c26fa..1bea1bff 100644 --- a/tests/updaters/test_splits.py +++ b/tests/updaters/test_splits.py @@ -9,11 +9,11 @@ @pytest.fixture def graph_with_counties(three_by_three_grid): for node in [0, 1, 2]: - three_by_three_grid.nodes[node]["county"] = "a" + three_by_three_grid.node_data(node)["county"] = "a" for node in [3, 4, 5]: - three_by_three_grid.nodes[node]["county"] = "b" + three_by_three_grid.node_data(node)["county"] = "b" for node in [6, 7, 8]: - three_by_three_grid.nodes[node]["county"] = "c" + three_by_three_grid.node_data(node)["county"] = "c" return three_by_three_grid @@ -43,12 +43,14 @@ def test_describes_splits_for_all_counties(self, partition): assert set(result.keys()) == {"a", "b", "c"} - after_a_flip = partition.flip({3: 1}) + after_a_flip = partition.flip({3: 1}, use_original_node_ids=True) second_result = after_a_flip["splits"] assert set(second_result.keys()) == {"a", "b", "c"} def test_no_splits(self, graph_with_counties): + + # frm: TODO: Why does this not just use "split_partition"? Isn't it the same? partition = Partition(graph_with_counties, assignment="county") result = compute_county_splits(partition, "county", None) @@ -57,7 +59,9 @@ def test_no_splits(self, graph_with_counties): assert splits_info.split == CountySplit.NOT_SPLIT def test_new_split(self, partition): - after_a_flip = partition.flip({3: 1}) + # Do a flip, using the node_ids of the original assignment (rather than the + # node_ids used internally in the RX-based graph) + after_a_flip = partition.flip({3: 1}, use_original_node_ids=True) result = after_a_flip["splits"] # County b is now split, but a and c are not @@ -74,7 +78,9 @@ def test_initial_split(self, split_partition): assert result["c"].split == CountySplit.NOT_SPLIT def test_old_split(self, split_partition): - after_a_flip = split_partition.flip({4: 1}) + # Do a flip, using the node_ids of the original assignment (rather than the + # node_ids used internally in the RX-based graph) + after_a_flip = split_partition.flip({4: 1}, use_original_node_ids=True) result = after_a_flip["splits"] # County b becomes more split @@ -87,11 +93,11 @@ def test_old_split(self, split_partition): "previous partition, which is not the intuitive behavior." 
 )
 def test_initial_split_that_disappears_and_comes_back(self, split_partition):
- no_splits = split_partition.flip({3: 2})
+ no_splits = split_partition.flip({3: 2}, use_original_node_ids=True)
 result = no_splits["splits"]
 assert all(info.split == CountySplit.NOT_SPLIT for info in result.values())
- split_comes_back = no_splits.flip({3: 1})
+ split_comes_back = no_splits.flip({3: 1}, use_original_node_ids=True)
 new_result = split_comes_back["splits"]
 assert new_result["a"].split == CountySplit.NOT_SPLIT
 assert new_result["b"].split == CountySplit.OLD_SPLIT
diff --git a/tests/updaters/test_updaters.py b/tests/updaters/test_updaters.py
index 37a4b97e..f560d717 100644
--- a/tests/updaters/test_updaters.py
+++ b/tests/updaters/test_updaters.py
@@ -33,10 +33,59 @@ def random_assignment(graph, num_districts):
 def partition_with_election(graph_with_d_and_r_cols):
 graph = graph_with_d_and_r_cols
 assignment = random_assignment(graph, 3)
+ """
+ # frm: TODO: NX vs RX Issue here - node_ids in parties_to_columns are in NX context...
+
+ This is an "interesting" issue - mostly meaning it is a PITA.
+
+ The Election class allows you to specify what you want to tally either as a node
+ attribute or as an explicit external mapping of node_ids to values to be added.
+
+ The problem is that if you pass in an explicit external mapping, you are almost
+ certainly using node_ids that are the "original" NX node_ids, which will NOT be
+ the same as the new RX node_ids assigned when a partition is created.
+
+ Note that if you just pass in an attribute name to get the data off the node,
+ then there is no problem - the Tally code just uses the partition's part (district)
+ and nodes-in-the-part (district) information. No translation to/from original
+ node_ids is necessary.
+
+ One approach to fixing this would be to just assume that any explicit mapping
+ should have the node_ids remapped after the partition is created. This could
+ be done by having the Election class defer doing the tally until AFTER the
+ partition has been created - the code would check to see if the tally exists,
+ and if it does not, then it would use the partition's information to
+ translate the parties_to_columns data to use internal node_ids and compute
+ the initial tally. After that, subsequent tallies for the next partition in
+ the chain should just work...
+
+ I am just not sure it is worth the extra complexity to continue to support
+ an explicit external mapping of node_ids to vote totals...
+
+ Need to ask Peter what he thinks we should do. Do external / legacy users
+ use this???
+
+ """
+ #
+ # This is a royal pain, because we do not yet have a partition that tells us how
+ # to map these "original" node_ids into "internal" node_ids.
+ #
+ # Even worse, this is a conceptual problem: setting up an Election before creating
+ # a partition is probably a common use case, so we don't want to make it
+ # complicated for users.
+ #
+ # Need to think about what the proper solution is. Should the Election updater
+ # translate from "original" node_ids to "internal" node_ids - perhaps keeping a
+ # cache of the mapping to make it more efficient?
+ #
+ # What would the migration path be for 1) legacy NX users and 2) future RX users?
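+ #
+ # For contrast, the attribute-name form used elsewhere in these tests sidesteps
+ # the node_id problem entirely, because Tally reads the columns off each node
+ # through the partition itself - no external node_id -> value mapping is needed:
+ #
+ #     election = Election("Mock Election", ["D", "R"], alias="election")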
+ # + parties_to_columns = { - "D": {node: graph.nodes[node]["D"] for node in graph.nodes}, - "R": {node: graph.nodes[node]["R"] for node in graph.nodes}, + "D": {node: graph.node_data(node)["D"] for node in graph.nodes}, + "R": {node: graph.node_data(node)["R"] for node in graph.nodes}, } + election = Election("Mock Election", parties_to_columns) updaters = {"Mock Election": election, "cut_edges": cut_edges} return Partition(graph, assignment, updaters) @@ -54,21 +103,33 @@ def chain_with_election(partition_with_election): def test_Partition_can_update_stats(): - graph = networkx.complete_graph(3) + nx_graph = networkx.complete_graph(3) assignment = {0: 1, 1: 1, 2: 2} - graph.nodes[0]["stat"] = 1 - graph.nodes[1]["stat"] = 2 - graph.nodes[2]["stat"] = 3 + nx_graph.nodes[0]["stat"] = 1 + nx_graph.nodes[1]["stat"] = 2 + nx_graph.nodes[2]["stat"] = 7 + + graph = Graph.from_networkx(nx_graph) updaters = {"total_stat": Tally("stat", alias="total_stat")} - partition = Partition(Graph.from_networkx(graph), assignment, updaters) - assert partition["total_stat"][2] == 3 + # This test is complicated by the fact that "original" node_ids are typically based + # on the node_ids for NX-based graphs, so in this test's case, those would be: 0, 1, 2 . + # However, when we create a Partition, we convert to an RX-based graph object and + # as a result the internal node_ids for the RX-based graph change. So, when we ask + # for graph data from a partition we need to be careful to use its internal node_ids. + + # Verify that the "total_stat" for the part (district) 2 is 7 + partition = Partition(graph, assignment, updaters) + assert partition["total_stat"][2] == 7 + + # Flip node with original node_id of 1 to be in part (district) 2 flip = {1: 2} - new_partition = partition.flip(flip) - assert new_partition["total_stat"][2] == 5 + new_partition = partition.flip(flip, use_original_node_ids=True) + + assert new_partition["total_stat"][2] == 9 def test_tally_multiple_columns(graph_with_d_and_r_cols): @@ -79,7 +140,7 @@ def test_tally_multiple_columns(graph_with_d_and_r_cols): partition = Partition(graph, assignment, updaters) expected_total_in_district_one = sum( - graph.nodes[i]["D"] + graph.nodes[i]["R"] for i in range(4) + graph.node_data(i)["D"] + graph.node_data(i)["R"] for i in range(4) ) assert partition["total"][1] == expected_total_in_district_one @@ -103,12 +164,13 @@ def test_vote_proportion_updater_returns_percentage_or_nan(partition_with_electi def test_vote_proportion_returns_nan_if_total_votes_is_zero(three_by_three_grid): + election = Election("Mock Election", ["D", "R"], alias="election") graph = three_by_three_grid for node in graph.nodes: for col in election.columns: - graph.nodes[node][col] = 0 + graph.node_data(node)[col] = 0 updaters = {"election": election} assignment = random_assignment(graph, 3) @@ -179,12 +241,41 @@ def test_election_result_has_a_cute_str_method(): assert str(results) == expected +def _convert_dict_of_set_of_rx_node_ids_to_set_of_nx_node_ids(dict_of_set_of_rx_nodes, nx_to_rx_node_id_map): + + # frm: TODO: This way to convert node_ids is clumsy and inconvenient. Think of something better... + + # When we create a partition from an NX based Graph we convert it to be an + # RX based Graph which changes the node_ids of the graph. 
If one wants
+ # to convert sets of RX based graph node_ids back to the node_ids in the
+ # original NX Graph, then we can do so by taking advantage of the
+ # nx_to_rx_node_id_map that is generated and saved when we converted the
+ # NX based graph to be based on RX.
+ #
+ # This routine converts the data that some updaters create - namely a mapping
+ # from parts (districts) to a set of node_ids.
+
+ converted_set = {}
+ if nx_to_rx_node_id_map is not None:  # means graph was converted from NX
+     # reverse the map so that it goes rx_node_id -> nx_node_id
+     rx_to_nx_node_id_map = {value: key for key, value in nx_to_rx_node_id_map.items()}
+     for part, set_of_rx_nodes in dict_of_set_of_rx_nodes.items():
+         converted_set[part] = {rx_to_nx_node_id_map[rx_node_id] for rx_node_id in set_of_rx_nodes}
+ return converted_set
+
 def test_exterior_boundaries_as_a_set(three_by_three_grid):
 graph = three_by_three_grid
 for i in [0, 1, 2, 3, 5, 6, 7, 8]:
- graph.nodes[i]["boundary_node"] = True
- graph.nodes[4]["boundary_node"] = False
+ graph.node_data(i)["boundary_node"] = True
+ graph.node_data(4)["boundary_node"] = False
 assignment = {0: 1, 1: 1, 2: 2, 3: 1, 4: 1, 5: 2, 6: 2, 7: 2, 8: 2}
 updaters = {
@@ -194,27 +285,63 @@
 partition = Partition(graph, assignment, updaters)
 result = partition["exterior_boundaries_as_a_set"]
- assert result[1] == {0, 1, 3} and result[2] == {2, 5, 6, 7, 8}
- # 112 111
- # 112 -> 121
- # 222 222
- flips = {4: 2, 2: 1, 5: 1}
+ # frm: TODO: Come up with a nice way to convert the result, which uses
+ #            RX based node_ids, back to the original NX based node_ids...
- new_partition = Partition(parent=partition, flips=flips)
+ # If the original graph that the partition was based on was an NX graph
+ # then we need to convert the RX node_ids in the partition's graph
+ # back to what they were in the NX graph.
+ nx_to_rx_node_id_map = partition.graph.get_nx_to_rx_node_id_map()
+ if nx_to_rx_node_id_map is not None:
+     converted_result = _convert_dict_of_set_of_rx_node_ids_to_set_of_nx_node_ids(result, nx_to_rx_node_id_map)
+     result = converted_result
+
+ assert result[1] == {0, 1, 3} and result[2] == {2, 5, 6, 7, 8}
+
+ # Flip nodes and then recompute partition
+ # boundaries to make sure the updater works properly.
+ # The new partition map will look like this:
+ #
+ # 112 111
+ # 112 -> 121
+ # 222 222
+ #
+ # In terms of the original NX graph's node_ids, we would
+ # do the following flips: 4->2, 2->1, and 5->1
+ #
+ # However, the node_ids in the partition's graph have changed due to
+ # conversion to RX, so we need to translate the flips into RX node_ids
+ # by indexing the nx -> rx map directly (the flips are keyed by NX node_ids).
+
+ nx_flips = {4: 2, 2: 1, 5: 1}
+ rx_flips = {nx_to_rx_node_id_map[nx_node_id]: part for nx_node_id, part in nx_flips.items()}
+
+ new_partition = Partition(parent=partition, flips=rx_flips)
 result = new_partition["exterior_boundaries_as_a_set"]
+ # If the original graph that the partition was based on was an NX graph
+ # then we need to convert the RX node_ids in the partition's graph
+ # back to what they were in the NX graph.
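+ #
+ # A sketch of the inversion the converter helper performs, assuming
+ # get_nx_to_rx_node_id_map() returns a plain {nx_node_id: rx_node_id} dict:
+ #
+ #     nx_to_rx = {"a": 0, "b": 1}   # hypothetical NX node_ids
+ #     rx_to_nx = {rx: nx for nx, rx in nx_to_rx.items()}
+ #     assert rx_to_nx == {0: "a", 1: "b"}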
+ nx_to_rx_node_id_map = new_partition.graph.get_nx_to_rx_node_id_map() + if nx_to_rx_node_id_map is not None: + converted_result = _convert_dict_of_set_of_rx_node_ids_to_set_of_nx_node_ids(result, nx_to_rx_node_id_map) + result = converted_result + assert result[1] == {0, 1, 2, 3, 5} and result[2] == {6, 7, 8} def test_exterior_boundaries(three_by_three_grid): + + # frm: TODO: Need to deal with NX vs. RX node_ids here - look at the other test_exterior_boundaries test + graph = three_by_three_grid for i in [0, 1, 2, 3, 5, 6, 7, 8]: - graph.nodes[i]["boundary_node"] = True - graph.nodes[i]["boundary_perim"] = 2 - graph.nodes[4]["boundary_node"] = False + graph.node_data(i)["boundary_node"] = True + graph.node_data(i)["boundary_perim"] = 2 + graph.node_data(4)["boundary_node"] = False assignment = {0: 1, 1: 1, 2: 2, 3: 1, 4: 1, 5: 2, 6: 2, 7: 2, 8: 2} updaters = { @@ -229,9 +356,15 @@ def test_exterior_boundaries(three_by_three_grid): # 112 111 # 112 -> 121 # 222 222 - flips = {4: 2, 2: 1, 5: 1} + flips = {4: 2, 2: 1, 5: 1} + + # Convert the flips into internal node_ids + internal_flips = {} + for node_id, part in flips.items(): + internal_node_id = partition.graph.internal_node_id_for_original_node_id(node_id) + internal_flips[internal_node_id] = part - new_partition = Partition(parent=partition, flips=flips) + new_partition = Partition(parent=partition, flips=internal_flips) result = new_partition["exterior_boundaries"] @@ -241,13 +374,28 @@ def test_exterior_boundaries(three_by_three_grid): def test_perimeter(three_by_three_grid): graph = three_by_three_grid for i in [0, 1, 2, 3, 5, 6, 7, 8]: - graph.nodes[i]["boundary_node"] = True - graph.nodes[i]["boundary_perim"] = 1 - graph.nodes[4]["boundary_node"] = False + graph.node_data(i)["boundary_node"] = True + # frm: TODO: Update test - boundary_perim should be 2 for corner nodes... + graph.node_data(i)["boundary_perim"] = 1 + graph.node_data(4)["boundary_node"] = False for edge in graph.edges: - graph.edges[edge]["shared_perim"] = 1 + graph.edge_data(edge)["shared_perim"] = 1 + + """ + frm: TODO: BIG bug/issue here - assignments break when converting to RX + The problem is that RX renumbers nodes when it converts an NX graph to RX. It + does this so that it can be sure that there are no gaps - and also because sometimes + node_ids in NX are not integers. In any event, that means that any assignment + for a Partition needs to have its node_ids (from NX) converted to be whatever RX + decided to use for the new node_ids. + + I am not sure how to do this, because it does not appear that RX saves the NX + node_ids. Need to check that, though... + + HMMMMM.... + """ assignment = {0: 1, 1: 1, 2: 2, 3: 1, 4: 1, 5: 2, 6: 2, 7: 2, 8: 2} updaters = { "exterior_boundaries": exterior_boundaries, @@ -275,6 +423,22 @@ def reject_half_of_all_flips(partition): def test_elections_match_the_naive_computation(partition_with_election): + + # frm: TODO: This test fails - find out why. 
+ + """ + The pytest output follows: + + File "/Users/fred/Documents/_play/_python/_redistricting/_gerrychain/_rustworkx_work/GerryChain/tests/updaters/test_updaters.py", line 391, in test_elections_match_the_naive_computation + assert expected_party_totals == election_view.totals_for_party +AssertionError: assert {'D': {0: 119...2268, 2: 162}} == {'D': {0: 119...: 2430, 2: 0}} + + Differing items: + {'D': {0: 1191, 1: 2946, 2: 152}} != {'D': {0: 1191, 1: 3098, 2: 0}} + {'R': {0: 1171, 1: 2268, 2: 162}} != {'R': {0: 1171, 1: 2430, 2: 0}} + + """ + chain = MarkovChain( propose_random_flip, Validator([no_vanishing_districts, reject_half_of_all_flips]), @@ -294,6 +458,6 @@ def test_elections_match_the_naive_computation(partition_with_election): def expected_tally(partition, column): return { - part: sum(partition.graph.nodes[node][column] for node in nodes) + part: sum(partition.graph.node_data(node)[column] for node in nodes) for part, nodes in partition.parts.items() }