Add a script for backing up GitHub issues.

5bf5aa63 · Stefan Behnel · 6e2d9fd2 · 5bf5aa63
Commit 5bf5aa63 authored Jul 16, 2021 by Stefan Behnel
Hide whitespace changes
Inline Side-by-side

Showing with 142 additions and 0 deletions

Tools/dump_github_issues.py Tools/dump_github_issues.py +142 -0

No files found.
--- a/Tools/dump_github_issues.py
+++ b/Tools/dump_github_issues.py
+"""
+Dump the GitHub issues of the current project to a file (.json.gz).
+
+Usage:  python3 Tools/dump_github_issues.py
+"""
+
+import configparser
+import gzip
+import json
+import os.path
+
+from datetime import datetime
+from urllib.request import urlopen
+
+GIT_CONFIG_FILE = ".git/config"
+
+
+class RateLimitReached(Exception):
+    """Raised when a GitHub API response message mentions a rate limit."""
+
+
+def gen_urls(repo):
+    """
+    Generate the paginated GitHub API URLs that list all issues of 'repo'.
+
+    'repo' is inserted into the URL as the repository path (e.g. "cython/cython").
+    The generator never terminates by itself; the consumer has to stop iterating,
+    e.g. once a page comes back empty.
+    """
+    # GitHub numbers result pages starting at 1 (which is also the default page).
+    # Page 0 is not a documented page and risks fetching the first page twice.
+    page = 1
+    while True:
+        yield f"https://api.github.com/repos/{repo}/issues?state=all&per_page=100&page={page}"
+        page += 1
+
+
+def read_rate_limit():
+    """Request the rate limit status from the GitHub API and return the decoded JSON."""
+    with urlopen("https://api.github.com/rate_limit") as response:
+        limits = json.load(response)
+    return limits
+
+
+def parse_rate_limit(limits):
+    """
+    Extract the 'core' resource limits from a decoded rate limit response.
+
+    Returns the tuple (limit, remaining, reset time), where the 'reset' value
+    is converted with datetime.fromtimestamp().
+    """
+    core = limits['resources']['core']
+    limit = core['limit']
+    remaining = core['remaining']
+    reset_time = datetime.fromtimestamp(core['reset'])
+    return limit, remaining, reset_time
+
+
+def load_url(url):
+    """
+    Download one page of JSON list data from 'url'.
+
+    Returns the decoded list, or None if the list is empty (i.e. the last page was passed).
+    Raises RateLimitReached if the response reports a rate limit, either in a regular
+    JSON reply or in the body of an HTTP 403/429 error.  Other HTTP errors propagate.
+    """
+    from urllib.error import HTTPError
+
+    try:
+        with urlopen(url) as p:
+            data = json.load(p)
+    except HTTPError as err:
+        # urlopen() raises for 403/429 status codes instead of returning the response,
+        # so the JSON error message has to be read from the exception object.
+        if err.code in (403, 429):
+            try:
+                body = json.load(err)
+            except (ValueError, AttributeError, OSError):
+                body = None  # unreadable or non-JSON error body
+            message = body.get('message', '') if isinstance(body, dict) else ''
+            if isinstance(message, str) and 'rate limit' in message:
+                raise RateLimitReached() from err
+        raise
+
+    if isinstance(data, dict) and 'rate limit' in data.get('message', ''):
+        raise RateLimitReached()
+
+    assert isinstance(data, list), type(data)
+    return data or None  # None indicates empty last page
+
+
+def join_list_data(lists):
+    """
+    Concatenate the items of 'lists' into one list, stopping at the first empty item.
+
+    'lists' is consumed lazily and not read beyond the first empty (or None) entry,
+    so it may be an endless iterator.
+    """
+    merged = []
+    pages = iter(lists)
+    page = next(pages, None)
+    while page:
+        merged.extend(page)
+        page = next(pages, None)
+    return merged
+
+
+def output_filename(repo):
+    """Build a '.json.gz' file name from 'repo' (slashes replaced by '_') and the current time."""
+    time_part = datetime.now().strftime('%Y%m%d_%H%M%S')
+    repo_part = repo.replace('/', '_')
+    return f"github_issues_{repo_part}_{time_part}.json.gz"
+
+
+def write_gzjson(file_name, data, indent=2):
+    """Write 'data' as JSON with the given indentation into the gzip compressed, UTF-8 encoded file 'file_name'."""
+    with gzip.open(file_name, mode="wt", encoding='utf-8') as out:
+        json.dump(data, fp=out, indent=indent)
+
+
+def find_origin_url(git_config=GIT_CONFIG_FILE):
+    """
+    Read the URL of the "origin" remote from the git config file 'git_config'.
+
+    Raises FileNotFoundError if the file does not exist, and a configparser error
+    if it has no 'remote "origin"' section or no 'url' entry in that section.
+    """
+    # Git config files may repeat sections, contain literal '%' characters in values
+    # and use keys without a value, all of which the default ConfigParser rejects.
+    parser = configparser.ConfigParser(strict=False, interpolation=None, allow_no_value=True)
+    # Open the file explicitly since ConfigParser.read() silently skips missing files.
+    with open(git_config) as config_file:
+        parser.read_file(config_file)
+    return parser.get('remote "origin"', 'url')
+
+
+def parse_repo_name(git_url):
+    """
+    Extract the "owner/name" repository path from a git remote URL.
+
+    Handles URLs with '/' separated paths (https://, git+ssh://, ...) as well as
+    the scp-like form "user@host:owner/name.git".  A trailing '/' and a '.git'
+    suffix are ignored.
+    """
+    git_url = git_url.rstrip('/')
+    if git_url.endswith('.git'):
+        git_url = git_url[:-4]
+    parts = git_url.split('/')[-2:]
+    # In the scp-like form, the owner is still glued to "user@host:" at this point.
+    parts[0] = parts[0].rpartition(':')[2]
+    return '/'.join(parts)
+
+
+def dump_issues(repo):
+    """
+    Main entry point: download all issues of 'repo' and write them to a .json.gz file.
+
+    If the rate limit is reached while downloading, the current limits are printed
+    and no file is written.
+    """
+    print(f"Reading issues from repo '{repo}'")
+    try:
+        # Pages are loaded lazily, the joining step stops at the first empty page.
+        pages = (load_url(url) for url in gen_urls(repo))
+        issues = join_list_data(pages)
+    except RateLimitReached:
+        rate_limits = read_rate_limit()
+        limit, remaining, reset_time = parse_rate_limit(rate_limits)
+        print(f"FAILURE: Rate limits ({limit}) reached, remaining: {remaining}, reset at {reset_time}")
+    else:
+        filename = output_filename(repo)
+        print(f"Writing {len(issues)} to {filename}")
+        write_gzjson(filename, issues)
+
+
+### TESTS
+
+def test_join_list_data():
+    """Check join_list_data() against a table of (input lists, expected result) pairs."""
+    cases = [
+        ([], []),
+        ([[1,2]], [1,2]),
+        ([[1,2], [3]], [1,2,3]),
+        ([[0], [1,2], [3]], [0,1,2,3]),
+        ([[0], [1,2], [[[]],[]]], [0,1,2,[[]],[]]),
+    ]
+    for lists, expected in cases:
+        assert join_list_data(lists) == expected
+
+
+def test_output_filename():
+    """Check that the generated file name starts with the repo name and a timestamp."""
+    import re
+    pattern = re.compile(r"github_issues_re_po_[0-9]{8}_[0-9]{6}\.json")
+    assert pattern.match(output_filename("re/po"))
+
+
+def test_find_origin_url():
+    """Check that the default git config file yields a non-empty origin URL."""
+    origin_url = find_origin_url()
+    assert origin_url
+
+
+def test_parse_repo_name():
+    """Check parse_repo_name() against a table of (git URL, expected repo path) pairs."""
+    cases = [
+        ("https://github.com/cython/cython", "cython/cython"),
+        ("git+ssh://git@github.com/cython/cython.git", "cython/cython"),
+        ("git+ssh://git@github.com/fork/cython.git", "fork/cython"),
+    ]
+    for git_url, expected in cases:
+        assert parse_repo_name(git_url) == expected
+
+
+def test_write_gzjson():
+    """Check that write_gzjson() produces gzip compressed JSON with two-space indentation."""
+    import tempfile
+    with tempfile.NamedTemporaryFile() as tmp_file:
+        path = tmp_file.name
+        write_gzjson(path, [{}])
+
+        # The file must decompress to the JSON document that was written.
+        with gzip.open(path) as stream:
+            parsed = json.load(stream)
+        assert parsed == [{}]
+
+        # The default indentation must show up in the raw decompressed bytes.
+        with gzip.open(path) as stream:
+            raw = stream.read()
+        assert raw == b'[\n  {}\n]'
+
+
+### MAIN
+
+if __name__ == '__main__':
+    # Derive the repository path from the origin remote of the git config, then dump its issues.
+    origin_url = find_origin_url()
+    repo_name = parse_repo_name(origin_url)
+    dump_issues(repo_name)