unitedstates · JoshData · May 14, 2022 · May 14, 2022 · May 14, 2022 · May 14, 2022
diff --git a/tasks/bill_info.py b/tasks/bill_info.py
@@ -847,6 +847,32 @@ def parse_bill_action(action_dict, prev_status, bill_id, title):
         if new_status:
             status = new_status
 
+    m = re.search(r"Pursuant to .* the following bills passed under suspension of the rules: (.*)\.$", line, re.I)
+    if m:
+        # The list should certainly include this bill, but was it passed "as amended"?
+        as_amended = None
+        bill_list = m.group(1)
+        bill_list = bill_list.replace("and the following resolution was agreed to under suspension of the rules: ", "")
+        bill_list = bill_list.replace("and the following resolutions were agreed to under suspension of the rules: ", "")
+        bill_list = bill_list.replace("and ", "")
+        bill_list = re.split(r"\s*(?:;|,(?! as amended))\s*", bill_list)
+        for bill_item in bill_list:
+            bill_item = bill_item.lower().replace(".", "").replace(" ", "").split(",")
+            if bill_item[0] == (bill_type + number):
+                as_amended = len(bill_item) > 1
+        if as_amended is None: raise ValueError("Did not find bill in list: " + line)
+
+        vote_type = "vote" if (bill_type[0] == "h") else "vote2"
+        pass_fail = "pass"
+        action["type"] = "vote"
+        action["vote_type"] = vote_type
+        action["how"] = "by special rule"
+        action["where"] = "h"
+        action["result"] = pass_fail
+        new_status = new_status_after_vote(vote_type, pass_fail == "pass", "h", bill_type, False, as_amended, title, prev_status)
+        if new_status:
+            status = new_status
+
     # House motions to table adversely dispose of a pending matter, if agreed to. An agreed-to "motion to table the measure",
     # which is very infrequent, kills the legislation. If not agreed to, nothing changes. So this regex only captures
     # agreed-to motions to table.
@@ -860,7 +886,7 @@ def parse_bill_action(action_dict, prev_status, bill_id, title):
 
         # In order to classify this as resulting in the same thing as regular failed vote on passage, new_status_after_vote
         # needs to know if this was a vote in the originating chamber or not.
-        if prev_status == "INTRODUCED" or bill_id.startswith("hres"):
+        if prev_status in ("INTRODUCED", "REPORTED") or bill_id.startswith("hres"):
             vote_type = "vote"
         elif False:
             vote_type = "vote2"
@@ -999,27 +1025,25 @@ def parse_bill_action(action_dict, prev_status, bill_id, title):
         if new_status:
             status = new_status
 
-    # PSUDO-REPORTING (because GovTrack did this, but should be changed)
-
-    # TODO: Make a new status for this as pre-reported.
-    m = re.search(r"Placed on (the )?([\w ]+) Calendar( under ([\w ]+))?[,\.] Calendar No\. (\d+)\.|Committee Agreed to Seek Consideration Under Suspension of the Rules|Ordered to be Reported", line, re.I)
+    # Calendar placement. Not useful in itself, but kept because GovTrack has always had it.
+    m = re.search(r"Placed on (the )?([\w ]+) Calendar( under ([\w ]+))?[,\.] Calendar No\. (\d+)\.", line, re.I)
     if m != None:
-        # TODO: This makes no sense.
-        if prev_status in ("INTRODUCED", "REFERRED"):
-            status = "REPORTED"
-
         action["type"] = "calendar"
-
-        # TODO: Useless. But good for GovTrack compatibility.
-        if m.group(2):  # not 'Ordered to be Reported'
-            action["calendar"] = m.group(2)
-            action["under"] = m.group(4)
-            action["number"] = m.group(5)
+        action["calendar"] = m.group(2)
+        action["under"] = m.group(4)
+        action["number"] = m.group(5)
 
     # COMMITTEE ACTIONS
 
+    # Ordered Reported (because GovTrack did this, but maybe should be changed to not combine with actual reported bills)
+    m = re.search(r"Ordered to be Reported|Committee Agreed to Seek Consideration Under Suspension of the Rules", line, re.I)
+    if m != None:
+        action["type"] = "ordered-reported"
+        if prev_status in ("INTRODUCED", "REFERRED"):
+            status = "REPORTED"
+
     # reported
-    m = re.search(r"Committee on (.*)\. Reported by", line, re.I)
+    m = re.search(r"Committee on (.*)\. (Original measure )?[Rr]eported (to Senate )?by", line, re.I)
     if m != None:
         action["type"] = "reported"
         action["committee"] = m.group(1)

diff --git a/tasks/bills.py b/tasks/bills.py
@@ -22,6 +22,9 @@ def run(options):
     if bill_id:
         to_fetch = bill_id.split(",")
     else:
+        if options.get("matching_action_regex"):
+            options["matching_action_regex"] = re.compile(options["matching_action_regex"])
+
         to_fetch = get_bills_to_process(options)
 
         if not to_fetch:
@@ -64,6 +67,12 @@ def filter_ints(seq):
                     # Not an integer.
                     continue
         congresses = sorted(filter_ints(os.listdir(get_data_path())))
+
+        # If we're reprocessing actions, start with the 93rd Congress.
+        # Before that we may have bill data from other sources that don't
+        # conform to the usual action parsing logic.
+        if options.get("reparse_actions"):
+            congresses = filter(lambda c : c >= 93, congresses)
     else:
         congresses = sorted([int(c) for c in options['congress'].split(',')])
 
@@ -74,28 +83,42 @@ def filter_ints(seq):
 
         # walk through all bill types in that congress
         # (sort by bill type so that we proceed in a stable order each run)
-
-        bill_types = [bill_type for bill_type in os.listdir(get_data_path(congress)) if not bill_type.startswith(".")]
+        path = get_data_path(congress)
+        if not os.path.exists(path): continue
+        bill_types = [bill_type for bill_type in os.listdir(path) if not bill_type.startswith(".")]
 
         for bill_type in sorted(bill_types):
 
             # walk through each bill in that congress and bill type
             # (sort by bill number so that we proceed in a normal order)
-
-            bills = [bill for bill in os.listdir(get_data_path(congress, bill_type)) if not bill.startswith(".")]
+            path = get_data_path(congress, bill_type)
+            if not os.path.exists(path): continue
+            bills = [bill for bill in os.listdir(path) if not bill.startswith(".")]
             for bill_type_and_number in sorted(
                 bills,
                 key = lambda x : int(x.replace(bill_type, ""))
                 ):
 
+                bill_id = bill_type_and_number + "-" + congress
+
+                if options.get("matching_action_regex"):
+                    # Include bills that have an action that matches a regular expression.
+                    fn = get_data_path(congress, bill_type, bill_type_and_number, "data.json")
+                    if os.path.exists(fn):
+                        with open(fn) as f:
+                            bill = json.load(f)
+                            for action in bill['actions']:
+                                if action.get('text') and options["matching_action_regex"].search(action['text']):
+                                    yield bill_id
+                    continue # don't check modification dates
+
                 fn = get_data_path(congress, bill_type, bill_type_and_number, govinfo.FDSYS_BILLSTATUS_FILENAME)
                 if os.path.exists(fn):
                     # The GovInfo.gov bulk data file exists. Does our JSON data
                     # file need to be updated?
                     bulkfile_lastmod = utils.read(fn.replace(".xml", "-lastmod.txt"))
                     parse_lastmod = utils.read(get_data_path(congress, bill_type, bill_type_and_number, "data-fromfdsys-lastmod.txt"))
                     if bulkfile_lastmod != parse_lastmod or options.get("force"):
-                        bill_id = bill_type_and_number + "-" + congress
                         yield bill_id
 
 def process_bill(bill_id, options):
@@ -115,7 +138,10 @@ def process_bill(bill_id, options):
     # Convert and write out data.json and data.xml.
     utils.write(
         json.dumps(bill_data, indent=2, sort_keys=True),
-        os.path.dirname(fdsys_xml_path) + '/data.json')
+        os.path.dirname(fdsys_xml_path) + '/data.json',
+        {
+            "diff": options.get("diff")
+        })
 
     from bill_info import create_govtrack_xml
     with open(os.path.dirname(fdsys_xml_path) + '/data.xml', 'wb') as xml_file:
@@ -128,7 +154,10 @@ def process_bill(bill_id, options):
     # file under a new path.
     utils.write(
         utils.read(_path_to_billstatus_file(bill_id).replace(".xml", "-lastmod.txt")),
-        os.path.join(os.path.dirname(fdsys_xml_path), "data-fromfdsys-lastmod.txt"))
+        os.path.join(os.path.dirname(fdsys_xml_path), "data-fromfdsys-lastmod.txt"),
+        {
+            "diff": options.get("diff")
+        })
 
     return {
         "ok": True,
@@ -239,42 +268,50 @@ def process_amendments(bill_id, bill_amendments, options):
 def reparse_actions(bill_id, options):
     # Load an existing bill status JSON file.
     data_json_fn = output_for_bill(bill_id, 'json')
+    if not os.path.exists(data_json_fn):
+        return {
+            "ok": True,
+            "saved": False,
+            "reason": "no file",
+        }
     source = utils.read(data_json_fn)
     bill_data = json.loads(source)
 
     # Munge data.
     from bill_info import parse_bill_action
     title = bill_info.current_title_for(bill_data['titles'], 'official')
-    old_status = None
+    old_status = "INTRODUCED"
     for action in bill_data['actions']:
       new_action, new_status = parse_bill_action(action, old_status, bill_id, title)
       if new_status:
         old_status = new_status
         action['status'] = new_status
+      elif 'status' in action:
+        del action['status']
       # clear out deleted keys
       for key in ('vote_type', 'how', 'where', 'result', 'roll', 'suspension', 'calendar', 'under', 'number', 'committee', 'pocket', 'law', 'congress'):
         if key in action and key not in new_action:
-          del action['key']
+          del action[key]
       action.update(new_action)
 
     status, status_date = bill_info.latest_status(bill_data['actions'], bill_data['introduced_at'])
     bill_data['status'] = status
     bill_data['status_at'] = status_date
 
-    # Show user a diff on the console to accept changes.
-    def show_diff_ask_ok(source, revised, fn):
-      if source == revised: return False # nothing to do
-      def split_lines(s): return [l+"\n" for l in s.split("\n")]
-      import sys
-      from difflib import unified_diff
-      sys.stdout.writelines(unified_diff(split_lines(source), split_lines(revised), fromfile=fn, tofile=fn))
-      return input("Apply change? (y/n) ").strip() == "y"
-
     wrote_any = False
 
+    if options.get("diff"):
+        confirmer = utils.show_diff_ask_ok
+    else:
+        # If no --diff is given, just check that
+        # the content hasn't changed --- don't bother
+        # writing out anything with identical content.
+        def confirmer(source, revised, fn):
+            return source != revised
+
     # Write new data.json file.
     revised = json.dumps(bill_data, indent=2, sort_keys=True)
-    if show_diff_ask_ok(source, revised, data_json_fn):
+    if confirmer(source, revised, data_json_fn):
       utils.write(revised, data_json_fn)
       wrote_any = True
 
@@ -284,7 +321,7 @@ def split_lines(s): return [l+"\n" for l in s.split("\n")]
     with open(data_xml_fn, 'r') as xml_file:
         source = xml_file.read()
     revised = create_govtrack_xml(bill_data, options)
-    if show_diff_ask_ok(source, revised.decode("utf8"), data_xml_fn):
+    if confirmer(source, revised.decode("utf8"), data_xml_fn):
       with open(data_xml_fn, 'wb') as xml_file:
         xml_file.write(revised)
       wrote_any = True
@@ -294,4 +331,3 @@ def split_lines(s): return [l+"\n" for l in s.split("\n")]
         "saved": wrote_any,
         "reason": "no changes or changes skipped by user",
     }
-
diff --git a/tasks/utils.py b/tasks/utils.py
@@ -347,31 +347,26 @@ def write(content, destination, options={}):
     if options.get("diff"):
         # Instead of writing the file, do a comparison with what's on disk
         # to test any changes. But be nice and replace any update date with
-        # what's in the previous file so we avoid spurrious changes. Use
-        # how updated_at appears in the JSON and in the XML.
+        # what's in the previous file so we avoid spurious changes in the
+        # diff. Use how updated_at appears in the JSON and in the XML.
         if os.path.exists(destination):
             with open(destination) as f:
-                existing_content = f.read()
+                source = f.read()
+            revised = content
             for pattern in ('"updated_at": ".*?"', 'updated=".*?"'):
-                m1 = re.search(pattern, existing_content)
-                m2 = re.search(pattern, content)
+                m1 = re.search(pattern, source)
+                m2 = re.search(pattern, revised)
                 if m1 and m2:
-                    content = content.replace(m2.group(0), m1.group(0))
+                    revised = revised.replace(m2.group(0), m1.group(0))
 
             # Avoid writing to disk and spawning `diff` by checking if
             # the files match in memory.
-            if content == existing_content:
+            if revised == source:
                 return
 
-        # Shell `diff` and let it display output directly to the console.
-        # Write `content` to disk first so diff can see it. Maybe more
-        # efficient to pipe?
-        fn = "/tmp/congress-changed-file"
-        with open(fn, 'w') as f:
-            f.write(content)
-        os.system("diff -u %s %s" % (destination, fn))
-        os.unlink(fn)
-        return
+            if not show_diff_ask_ok(source, revised, destination):
+                # User cancelled save.
+                return
 
     # Save the content to disk.
     mkdir_p(os.path.dirname(destination))
@@ -382,6 +377,19 @@ def write(content, destination, options={}):
         f.write(content)
     f.close()
 
+
+def show_diff_ask_ok(source, revised, fn):
+    # Show the user a unified diff of source vs. revised (both labelled fn) on the console; return True only if they answer "y".
+    source = re.sub(r"\s*\n", "\n", source) # old files had trailing spaces; note \s also matches "\n", so runs of blank lines collapse too
+    revised = re.sub(r"\s*\n", "\n", revised) # be consistent in normalization
+    if source == revised: return False # nothing to do once normalized, so the user is not prompted
+    def split_lines(s): return [l+"\n" for l in s.split("\n")] # writelines() adds no separators, so each line carries its own "\n"
+    import sys
+    from difflib import unified_diff
+    sys.stdout.writelines(unified_diff(split_lines(source), split_lines(revised), fromfile=fn, tofile=fn))
+    return input("Apply change? (y/n) ").strip() == "y" # any answer other than exactly "y" (after stripping) declines
+
+
 def write_json(data, destination):
     return write(
         json.dumps(data,