Commit aad6e6a1 authored by Cyril Matthey-Doret's avatar Cyril Matthey-Doret
Browse files

add fork collection scripts

parent db3fcdae
Pipeline #327043 passed with stage
in 14 seconds
#!/usr/bin/env bash
# Reads the JSON output of the fork collection script (either as a file or on stdin)
# and clones all target forks into the provided directory at the target commit.
# usage: ./<this_script> OUT_DIR forks.json
#        ./<collection_script> --token token.asc URL | ./<this_script> OUT_DIR
# Help message: usage() prints the command synopsis and the description of the
# two positional arguments (outdir, in_file.json) through a here-document.
# NOTE(review): no closing `EOF` line and no closing `}` are visible in this
# extract, so the `exit 0` below currently reads as here-document text --
# confirm against the original script.
function usage () {
cat <<EOF
$(basename $0) [outdir] [in_file.json]
./ --token token.asc URL | $(basename $0) [outdir]
Reads json output from and clone all target forks into provided directory
at the target commit. Repositories are cloned into outdir/namespace.
outdir: Directory where all forks will be cloned [default: .]
in_file.json: JSON output of containing fork metadata [default: stdin]
exit 0
# Parsing CL arguments
# The condition holds when more than two arguments are given, or when the
# first argument is -h or --help.
# NOTE(review): the body of this `if`, its closing `fi`, and the assignments
# of OUT_DIR and JSON are not visible in this extract -- restore them from the
# original script.
if [[ $# -gt 2 ]] || [[ $1 == '-h' ]] || [[ $1 == '--help' ]]; then
# Create the output directory, including any missing parents.
mkdir -p ${OUT_DIR}
# For every element of the input JSON array, jq builds one command string:
#   git clone <url> <OUT_DIR>/<group> && cd <OUT_DIR>/<group> && git checkout <commit>
# and xargs passes each resulting line to `sh -c`.
# NOTE(review): the url/group/commit fields are interpolated into a shell
# command without quoting -- verify that the JSON input is trusted.
jq ".[] | \"git clone \(.url) $OUT_DIR/\(.group) && cd ${OUT_DIR}/\(.group) && git checkout \(.commit)\"" \
< ${JSON} \
| xargs -L 1 -I {} sh -c "{}"
#!/usr/bin/env python3
# Collect all group-owned forks of a given project using Gitlab API
# Forks metadata is sent to stdout and contains the following fields:
# - id (gitlab identifier)
# - url
# - group
# - members (username, full name and email of each member)
# - commit (the last commit before specified deadline)
# - autostart_url (the URL to start a renku session at the last pre-deadline commit)
import re
from typing import Tuple, List, Dict, Optional
import json
import requests
import click
from datetime import datetime
import pytz
# Replace URL
def validate_iso_date(date: str) -> str:
    """Check that a string can be parsed as an ISO-8601 date or datetime.

    Args:
        date: Candidate string, for example ``2022-03-29T13:10:29``.

    Returns:
        The input string, unchanged, once it has been validated.

    Raises:
        ValueError: If ``datetime.fromisoformat`` cannot parse ``date``.
    """
    try:
        datetime.fromisoformat(date)
    except ValueError as err:
        # Keep the user-facing message but preserve the parsing error as the cause.
        raise ValueError("Deadline must be in ISO-8601 format.") from err
    # NOTE(review): the original return statement is not visible in this
    # extract; returning the validated string matches the `-> str` annotation
    # and is harmless for callers that ignore the result -- confirm.
    return date
# Split a repository URL into (base, namespace, repo) using the named groups
# "base", "namespace" and "repo" of a regular expression; the namespace is
# returned with leading and trailing slashes stripped.
# NOTE(review): in this extract the docstring is never closed, the doctest
# URLs are empty strings and the pattern passed to re.compile( is missing --
# restore them from the original file before relying on this function.
# NOTE(review): re.match returns None for a non-matching URL, so the
# .groupdict() call would raise AttributeError -- confirm whether callers
# expect that.
def parse_repo_url(url: str) -> Tuple[str, str, str]:
"""Decompose a full repo URL into 3 parts:
- the organization base URL
- the namespace (i.e. groups and subgroups
- the name of the repository
>>> parse_repo_url('')
('', 'group/subgroup', 'repo')
>>> parse_repo_url('')
('', 'group', 'repo')
regex = re.compile(
captured = re.match(regex, url).groupdict()
base, namespace, repo = [captured[group] for group in ["base", "namespace", "repo"]]
namespace = namespace.strip("/")
return base, namespace, repo
# Look up the numeric id of the project whose "path_with_namespace" equals
# "<namespace>/<repo>" as parsed from project_url. Result pages are requested
# one after another (page counter starting at 1) until a page yields a match;
# ValueError is raised when a page yields more than one match.
# NOTE(review): `header=Dict[str, str]` makes the typing object the DEFAULT
# value of `header` instead of annotating it; `header: Dict[str, str]` was
# presumably intended -- confirm before changing, since it removes the default.
# NOTE(review): the arguments of requests.get( are missing from this extract,
# and with indentation lost it is unclear which statements belong to the
# `if resp.ok:` branch.
# NOTE(review): the loop only stops once a match is found; it looks like a
# project that is never listed would keep the loop running -- verify.
def get_project_id(project_url: str, header=Dict[str, str]) -> int:
"""Given a project's URL, return it's gitlab ID"""
base, namespace, repo = parse_repo_url(project_url)
project = []
page = 1
while not len(project):
resp = requests.get(
if resp.ok:
resp = resp.json()
page += 1
project = [p for p in resp if p["path_with_namespace"] == f"{namespace}/{repo}"]
if len(list(project)) > 1:
raise ValueError("More than one project matched input url")
return project[0]["id"]
# Gather the metadata of every fork of the project at project_url: the
# upstream project id is resolved with get_project_id, then pages are
# requested (page counter starting at 1) and concatenated until an empty page
# is returned.
# NOTE(review): `header=Dict[str, str]` makes the typing object the DEFAULT
# value of `header` instead of annotating it; `header: Dict[str, str]` was
# presumably intended -- confirm.
# NOTE(review): the arguments of requests.get( and the end of that call are
# missing from this extract; `len(new_forks)` and `forks += new_forks` imply
# the response is decoded to a list -- restore from the original file.
def collect_forks(project_url: str, header=Dict[str, str]) -> List[Dict]:
"""Retrieve the metadata from all forks of input project"""
base, namespace, repo = parse_repo_url(project_url)
upstream_id = get_project_id(project_url, header)
# Collect all forks
page = 1
forks = []
has_content = True
while has_content:
new_forks = requests.get(
has_content = len(new_forks) > 0
forks += new_forks
page += 1
return forks
def filter_group_forks(forks: List[Dict]) -> List[Dict]:
    """Given a list of forks' metadata, only keep those that belong to a group.

    Args:
        forks: Fork metadata dictionaries, each with a ``namespace`` mapping
            that has a ``kind`` entry.

    Returns:
        A new list with the forks whose ``namespace["kind"]`` is ``"group"``,
        in their original order.

    >>> d1 = {'id': 1, 'namespace': {'kind': 'user'}}
    >>> d2 = {'id': 2, 'namespace': {'kind': 'group'}}
    >>> filter_group_forks([d1, d2]) == [d2]
    True
    """
    return [f for f in forks if f["namespace"]["kind"] == "group"]
def get_last_commit_hash(
    project_id: int,
    base_url: str,
    header: Dict[str, str],
    deadline: Optional[str] = None,
) -> Optional[str]:
    """Get the hash of the last commit authored on or before a deadline.

    Args:
        project_id: Gitlab identifier of the project.
        base_url: Base URL of the Gitlab instance; ``/api/v4/...`` is appended.
        header: HTTP headers sent with the request.
        deadline: ISO-8601 date or datetime. A value without UTC offset is
            interpreted as UTC. When ``None``, the first commit returned by
            the API is used.

    Returns:
        The ``id`` of the first commit in the response whose
        ``authored_date`` is not later than the deadline, or ``None`` when no
        commit in the response qualifies.
    """
    # There is an 'until' option in the commits API, but it seems bugged...
    # NOTE(review): the tail of this call is missing from the extract;
    # `.json()` is inferred from the loop indexing each commit by key -- confirm.
    # Only the commits returned by this single request are examined.
    commits = requests.get(
        f"{base_url}/api/v4/projects/{project_id}/repository/commits",
        headers=header,
    ).json()

    # Parse the deadline only when one is given: the original parsed it
    # unconditionally, which raised on the default ``deadline=None``.
    loc_deadline = None
    if deadline is not None:
        parsed = datetime.fromisoformat(deadline)
        # Assumes we're in UTC when the deadline carries no offset; localize()
        # rejects datetimes that are already timezone-aware.
        loc_deadline = pytz.UTC.localize(parsed) if parsed.tzinfo is None else parsed

    # Commits are sorted in reverse chronological order by default
    for commit in commits:
        if loc_deadline is None:
            return commit["id"]
        commit_date = datetime.fromisoformat(commit["authored_date"])
        if commit_date <= loc_deadline:
            return commit["id"]
    return None
# Build the output record for one fork. Keys written below: "id", "url"
# (taken from fork["http_url_to_repo"]), "group", "members", "commit" and
# "autostart_url". One extra HTTP request fetches the member list, and
# get_last_commit_hash is called for the commit.
# NOTE(review): the docstring lists "http_url_to_repo" as a result field, but
# the value is stored under "url".
# NOTE(review): in this extract the closing "}" of the meta literal, the
# closing "]" of the members list and the remainder of the autostart URL
# expression are missing -- restore them from the original file.
def format_fork_metadata(
fork: Dict, header: Dict[str, str], deadline: Optional[str] = None
) -> Dict:
"""Format and add fields to a fork's metadata. The
resulting metadata will have the following fields:
id, http_url_to_repo, autostart_url, commit, members, group"""
meta = {
"id": fork["id"],
"url": fork["http_url_to_repo"],
"group": fork["namespace"]["full_path"],
# Additional API query to retrieve members metadata
members_url = fork["_links"]["members"]
members = requests.get(members_url + "/all", headers=header).json()
# Only retain relevant user fields
member_keys = ("username", "name", "email")
meta["members"] = [
{key: member.get(key) for key in member_keys} for member in members
base, namespace, repo = parse_repo_url(meta["url"])
# Get the last commit before deadline
meta["commit"] = get_last_commit_hash(meta["id"], base, header, deadline)
# Build a renku autostart url using the commit hash
autostart_url = (
f"{re.sub(r'/gitlab.*$', '', base)}"
meta["autostart_url"] = autostart_url
return meta
"--deadline", type=str, help="ISO-8601 formatted date. Example: 2022-03-29T13:10:29"
# Command-line entry point: obtains the API token, builds the PRIVATE-TOKEN
# header, collects the forks of repo_url, keeps the group-owned ones and maps
# format_fork_metadata over them.
# NOTE(review): several lines are missing from this extract -- the
# @click.command / @click.option( decorator heads, the `else:` presumably
# guarding the token file read, the statement under `if deadline is not None:`
# (presumably the validate_iso_date call), whatever consumes `meta`, and the
# main() call under the __main__ guard. Restore them from the original file.
# NOTE(review): `open(token).read()` leaves the file to be closed by the
# garbage collector; a `with` block would close it deterministically.
@click.argument("repo_url", type=str)
help="Armored ASCII file containing the Gitlab API token. If not provided, you will be prompted for the token.",
def main(repo_url, token, deadline=None):
if token is None:
token = click.prompt("Please enter your Gitlab API token", hide_input=True)
token = open(token).read().strip()
# Check for valid deadline format
if deadline is not None:
# Get metadata of all forks from input project
header = {"PRIVATE-TOKEN": token}
forks = collect_forks(repo_url, header)
# Only keep those which belong to a group
forks = filter_group_forks(forks)
# Reformat metadata for convenience
meta = map(lambda f: format_fork_metadata(f, header, deadline), forks)
if __name__ == "__main__":
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment