Commit aad6e6a1 authored by Cyril Matthey-Doret's avatar Cyril Matthey-Doret
Browse files

add fork collection scripts

parent db3fcdae
Pipeline #327043 passed with stage
in 14 seconds
#!/usr/bin/env bash
# Reads JSON output from collect_forks.py (either from a file or from stdin)
# and clones all target forks into the provided directory at the target commit.
# usage ./clone_forks_from_json.sh OUT_DIR forks.json
# ./collect_forks.py --token token.asc URL | ./clone_forks_from_json.sh OUT_DIR
# Help message
# Print the help message on stdout and terminate the script successfully.
function usage () {
    # Quote "$0" so that a script path containing whitespace is not word-split.
    local prog
    prog=$(basename -- "$0")
    cat <<EOF
Usage:
${prog} [outdir] [in_file.json]
./collect_forks.py --token token.asc URL | ${prog} [outdir]
Reads json output from collect_forks.py and clone all target forks into provided directory
at the target commit. Repositories are cloned into outdir/namespace.
Arguments:
outdir: Directory where all forks will be cloned [default: .]
in_file.json: JSON output of collect_forks.py containing fork metadata [default: stdin]
EOF
    exit 0
}
# Parsing CL arguments
# Show the help before touching the filesystem.
if [[ $# -gt 2 ]] || [[ ${1:-} == '-h' ]] || [[ ${1:-} == '--help' ]]; then
    usage
fi
OUT_DIR=${1:-.}
JSON=${2:-/dev/stdin}

mkdir -p -- "${OUT_DIR}"

# Emit one tab-separated record (url, group, commit) per fork and handle each
# record with properly quoted variables. Values coming from the JSON are never
# interpolated into a shell command string, so unusual characters in a URL or
# group name cannot break (or inject into) the command.
jq -r '.[] | [.url, .group, .commit] | @tsv' < "${JSON}" \
| while IFS=$'\t' read -r url group commit; do
    # git must not consume the loop's stdin (the remaining records).
    git clone -- "${url}" "${OUT_DIR}/${group}" < /dev/null \
        && git -C "${OUT_DIR}/${group}" checkout "${commit}" < /dev/null
done
#!/usr/bin/env python3
# Collect all group-owned forks of a given project using Gitlab API
# Forks metadata is sent to stdout and contains the following fields:
# - id (gitlab identifier)
# - url
# - group
# - members (username, full name and email of each member)
# - commit (the last commit before specified deadline)
# - autostart url (the URL to start a renku session at the last pre-deadline commit)
import re
from typing import Tuple, List, Dict, Optional
import json
import requests
import click
from datetime import datetime
import pytz
# Replace URL
def validate_iso_date(date: str) -> str:
    """Check that the input string is a valid ISO-8601 date or date-time.

    Parameters
    ----------
    date:
        Candidate string, e.g. ``2022-03-29`` or ``2022-03-29T13:10:29``.

    Returns
    -------
    str
        The input string, unchanged, once it has been validated.

    Raises
    ------
    ValueError
        If ``datetime.fromisoformat`` cannot parse the input.
    """
    try:
        datetime.fromisoformat(date)
    except ValueError as err:
        # Keep the parser's own error attached as the cause for debugging.
        raise ValueError("Deadline must be in ISO-8601 format.") from err
    return date
def parse_repo_url(url: str) -> Tuple[str, str, str]:
    """Decompose a full repo URL into 3 parts:

    - the organization base URL
    - the namespace (i.e. groups and subgroups)
    - the name of the repository

    Parameters
    ----------
    url:
        Full ``https://`` URL of the repository.

    Returns
    -------
    Tuple[str, str, str]
        ``(base, namespace, repo)``; the namespace has no leading or
        trailing slash.

    Raises
    ------
    ValueError
        If the URL does not match the expected ``https://host/.../repo`` shape.

    Examples
    --------
    >>> parse_repo_url('https://org.com/group/subgroup/repo')
    ('https://org.com', 'group/subgroup', 'repo')
    >>> parse_repo_url('https://org.com/gitlab/group/repo')
    ('https://org.com/gitlab', 'group', 'repo')
    """
    regex = re.compile(
        (
            # Host, optionally followed by a '/gitlab' or '/projects' prefix
            r"(?P<base>https://[^/]*(/gitlab|/projects)?)/"
            # Zero or more 'segment/' groups
            r"(?P<namespace>([^/]*/)*)"
            # Last path segment
            r"(?P<repo>[^/]*)$"
        ),
        re.IGNORECASE,
    )
    matched = regex.match(url)
    if matched is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError(f"Could not parse repository URL: {url!r}")
    base = matched.group("base")
    namespace = matched.group("namespace").strip("/")
    repo = matched.group("repo")
    return base, namespace, repo
def get_project_id(project_url: str, header: Optional[Dict[str, str]] = None) -> int:
    """Given a project's URL, return its gitlab ID.

    The projects search endpoint is queried page by page until a project
    whose ``path_with_namespace`` equals the namespace/repo of the URL is found.

    Parameters
    ----------
    project_url:
        Full URL of the project.
    header:
        HTTP headers sent with each request (e.g. the API token).

    Returns
    -------
    int
        The ``id`` field of the matching project.

    Raises
    ------
    ValueError
        If no project, or more than one project on a page, matches the URL.
    requests.HTTPError
        If the API answers with an error status.
    """
    base, namespace, repo = parse_repo_url(project_url)
    full_path = f"{namespace}/{repo}"
    page = 1
    while True:
        resp = requests.get(
            f"{base}/api/v4/projects",
            # Let requests URL-encode the search term.
            params={"search": repo, "per_page": 100, "page": page},
            headers=header,
        )
        resp.raise_for_status()
        candidates = resp.json()
        if not candidates:
            # An empty page means the results are exhausted: stop instead of
            # requesting further pages forever.
            raise ValueError(f"No project matched input url: {project_url}")
        matches = [p for p in candidates if p["path_with_namespace"] == full_path]
        if len(matches) > 1:
            raise ValueError("More than one project matched input url")
        if matches:
            return matches[0]["id"]
        page += 1
def collect_forks(project_url: str, header: Optional[Dict[str, str]] = None) -> List[Dict]:
    """Retrieve the metadata from all forks of input project.

    Parameters
    ----------
    project_url:
        Full URL of the upstream project.
    header:
        HTTP headers sent with each request (e.g. the API token).

    Returns
    -------
    List[Dict]
        The concatenation of every page returned by the forks endpoint.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    base, _, _ = parse_repo_url(project_url)
    upstream_id = get_project_id(project_url, header)
    # Collect all forks, 100 per page, until an empty page is returned
    forks: List[Dict] = []
    page = 1
    while True:
        resp = requests.get(
            f"{base}/api/v4/projects/{upstream_id}/forks",
            params={"per_page": 100, "page": page},
            headers=header,
        )
        # Without this check an error payload (a dict) would be merged into
        # the list and the loop would never terminate.
        resp.raise_for_status()
        new_forks = resp.json()
        if not new_forks:
            return forks
        forks.extend(new_forks)
        page += 1
def filter_group_forks(forks: List[Dict]) -> List[Dict]:
    """Select, in input order, the forks whose namespace kind is 'group'.

    Examples
    --------
    >>> d1 = {'id': 1, 'namespace': {'kind': 'user'}}
    >>> d2 = {'id': 2, 'namespace': {'kind': 'group'}}
    >>> filter_group_forks([d1, d2]) == [d2]
    True
    """
    group_owned = []
    for candidate in forks:
        owner_kind = candidate["namespace"]["kind"]
        if owner_kind == "group":
            group_owned.append(candidate)
    return group_owned
def get_last_commit_hash(
    project_id: int,
    base_url: str,
    header: Dict[str, str],
    deadline: Optional[str] = None,
) -> Optional[str]:
    """Get the hash of the last commit before a deadline.

    Parameters
    ----------
    project_id:
        Gitlab identifier of the project.
    base_url:
        Base URL of the gitlab instance.
    header:
        HTTP headers sent with each request (e.g. the API token).
    deadline:
        ISO-8601 date time. A value without UTC offset is interpreted as UTC.
        When None, the first commit returned by the API is selected.

    Returns
    -------
    Optional[str]
        The ``id`` of the first listed commit whose ``authored_date`` is not
        later than the deadline, or None when no commit qualifies.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    # Only parse the deadline when one is given: fromisoformat(None) raises.
    loc_deadline = None
    if deadline is not None:
        loc_deadline = datetime.fromisoformat(deadline)
        if loc_deadline.tzinfo is None:
            # Assumes we're in UTC; localize() rejects already-aware values.
            loc_deadline = pytz.UTC.localize(loc_deadline)

    # There is an 'until' option in the commits API, but it seems bugged...
    # so commits are filtered client-side, walking through the pages.
    page = 1
    while True:
        resp = requests.get(
            f"{base_url}/api/v4/projects/{project_id}/repository/commits",
            params={"per_page": 100, "page": page},
            headers=header,
        )
        resp.raise_for_status()
        commits = resp.json()
        if not commits:
            # History exhausted without finding a commit before the deadline
            return None
        # Commits are sorted in reverse chronological order by default
        for commit in commits:
            if loc_deadline is None:
                return commit["id"]
            commit_date = datetime.fromisoformat(commit["authored_date"])
            if commit_date <= loc_deadline:
                return commit["id"]
        page += 1
def format_fork_metadata(
    fork: Dict, header: Dict[str, str], deadline: Optional[str] = None
) -> Dict:
    """Format and add fields to a fork's metadata.

    Parameters
    ----------
    fork:
        Fork metadata; the keys ``id``, ``http_url_to_repo``,
        ``namespace.full_path`` and ``_links.members`` are read.
    header:
        HTTP headers sent with each request (e.g. the API token).
    deadline:
        Optional ISO-8601 date time forwarded to ``get_last_commit_hash``.

    Returns
    -------
    Dict
        A dictionary with the following fields:
        id, url, group, members, commit, autostart_url

    Raises
    ------
    requests.HTTPError
        If the members request answers with an error status.
    """
    meta = {
        "id": fork["id"],
        "url": fork["http_url_to_repo"],
        "group": fork["namespace"]["full_path"],
    }
    # Additional API query to retrieve members metadata
    members_url = fork["_links"]["members"]
    resp = requests.get(members_url + "/all", headers=header)
    # Surface HTTP errors here rather than failing later on an error payload
    resp.raise_for_status()
    members = resp.json()
    # Only retain relevant user fields
    member_keys = ("username", "name", "email")
    meta["members"] = [
        {key: member.get(key) for key in member_keys} for member in members
    ]
    base, namespace, repo = parse_repo_url(meta["url"])
    # Get the last commit before deadline
    meta["commit"] = get_last_commit_hash(meta["id"], base, header, deadline)
    # Build a renku autostart url using the commit hash: drop everything from
    # '/gitlab' onwards in the base URL and the '.git' suffix of the repo name
    renku_base = re.sub(r"/gitlab.*$", "", base)
    repo_name = repo.removesuffix(".git")
    meta["autostart_url"] = (
        f"{renku_base}"
        f"/projects/{namespace}/{repo_name}/sessions/"
        f"new?autostart=1&commit={meta['commit']}&branch=master"
    )
    return meta
@click.command()
@click.option(
    "--deadline", type=str, help="ISO-8601 formatted date. Example: 2022-03-29T13:10:29"
)
@click.argument("repo_url", type=str)
@click.option(
    "--token",
    type=click.Path(exists=True),
    help="Armored ASCII file containing the Gitlab API token. If not provided, you will be prompted for the token.",
)
def main(repo_url, token, deadline=None):
    """Print the metadata of all group-owned forks of REPO_URL as JSON."""
    if token is None:
        token = click.prompt("Please enter your Gitlab API token", hide_input=True)
    else:
        # Context manager so that the token file is closed deterministically
        with open(token) as token_file:
            token = token_file.read().strip()
    # Check for valid deadline format
    if deadline is not None:
        validate_iso_date(deadline)
    # Get metadata of all forks from input project
    header = {"PRIVATE-TOKEN": token}
    forks = collect_forks(repo_url, header)
    # Only keep those which belong to a group
    forks = filter_group_forks(forks)
    # Reformat metadata for convenience
    meta = [format_fork_metadata(fork, header, deadline) for fork in forks]
    print(json.dumps(meta))


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment