Skip to content
collect_forks.py 5.18 KiB
Newer Older
#!/usr/bin/env python3
# Collect all group-owned forks of a given project using Gitlab API
# Forks metadata is sent to stdout and contains the following fields:
# - id (gitlab identifier)
# - url
# - group
# - members (username, full name and email of each member)
# - commit (the last commit before specified deadline)
# - autostart url (the URL to start a renku session at the last pre-deadline commit)

import re
from typing import Tuple, List, Dict, Optional
import json
import requests
import click
from datetime import datetime
import pytz
from teach_utils.common_requests import parse_repo_url, get_project_id


def validate_iso_date(date: str) -> str:
    """Check that the input string is in ISO-8601 format.

    Parameters
    ----------
    date
        Candidate date or date-time string, e.g. ``2022-03-29T13:10:29``.

    Returns
    -------
    str
        The input string, unchanged, once it has been parsed successfully.

    Raises
    ------
    ValueError
        If ``datetime.fromisoformat`` cannot parse the string.

    Examples
    --------
    >>> validate_iso_date("2022-03-29T13:10:29")
    '2022-03-29T13:10:29'

    """
    try:
        # Parsing is only used as a validity check; the result is discarded.
        datetime.fromisoformat(date)
    except ValueError as err:
        # Keep the parser's own error attached as the cause for debugging.
        raise ValueError("Deadline must be in ISO-8601 format.") from err
    return date


def collect_forks(
    project_url: str, header: Optional[Dict[str, str]] = None
) -> List[Dict]:
    """Retrieve the metadata from all forks of input project.

    Parameters
    ----------
    project_url
        URL of the upstream project whose forks are listed.
    header
        HTTP headers sent with every request (the caller in this file passes
        a ``PRIVATE-TOKEN`` header). Defaults to no extra headers.

    Returns
    -------
    List[Dict]
        The fork records returned by the ``/forks`` endpoint, all pages
        concatenated in the order they were received.

    Raises
    ------
    requests.HTTPError
        If the API answers a page request with an error status.
    """
    # Only the first element is used here; presumably it is the base URL of
    # the Gitlab instance -- see parse_repo_url.
    base, _namespace, _repo = parse_repo_url(project_url)
    upstream_id = get_project_id(project_url, header)

    forks: List[Dict] = []
    page = 1
    while True:
        response = requests.get(
            f"{base}/api/v4/projects/{upstream_id}/forks",
            params={"per_page": 100, "page": page},
            headers=header,
        )
        # An error payload is a non-empty JSON object: without this check it
        # would be merged into the results and the loop would never end.
        response.raise_for_status()
        new_forks = response.json()
        if not new_forks:
            # An empty page means every fork has been collected.
            break
        forks += new_forks
        page += 1

    return forks


def filter_group_forks(forks: List[Dict]) -> List[Dict]:
    """Keep only the forks whose namespace is a group.

    Each fork must provide ``fork["namespace"]["kind"]``; forks for which
    that value is not ``"group"`` are dropped. Input order is preserved.

    Examples
    --------
    >>> d1 = {'id': 1, 'namespace': {'kind': 'user'}}
    >>> d2 = {'id': 2, 'namespace': {'kind': 'group'}}
    >>> filter_group_forks([d1, d2]) == [d2]
    True

    """
    group_owned = []
    for fork in forks:
        owner_kind = fork["namespace"]["kind"]
        if owner_kind != "group":
            continue
        group_owned.append(fork)
    return group_owned


def get_last_commit_hash(
    project_id: int,
    base_url: str,
    header: Dict[str, str],
    deadline: Optional[str] = None,
) -> Optional[str]:
    """Get the hash of the last commit before a deadline.

    Parameters
    ----------
    project_id
        Gitlab identifier of the project.
    base_url
        URL prefix placed in front of ``/api/v4/...``.
    header
        HTTP headers sent with every request.
    deadline
        ISO-8601 date or date-time. A value without UTC offset is interpreted
        as UTC. When omitted, the most recent commit is returned.

    Returns
    -------
    Optional[str]
        The ``id`` of the first listed commit whose ``authored_date`` is not
        later than the deadline, or ``None`` when no such commit exists.

    Raises
    ------
    requests.HTTPError
        If the API answers a page request with an error status.
    ValueError
        If ``deadline`` is not parseable by ``datetime.fromisoformat``.
    """
    loc_deadline = None
    if deadline is not None:
        loc_deadline = datetime.fromisoformat(deadline)
        if loc_deadline.tzinfo is None:
            # Naive deadlines are assumed to be UTC. pytz refuses to localize
            # a datetime that already carries an offset, hence the guard.
            loc_deadline = pytz.UTC.localize(loc_deadline)

    # There is an 'until' option in the commits API, but it seems bugged...
    # so the listing is walked page by page and filtered locally instead.
    # Commits are sorted in reverse chronological order by default.
    page = 1
    while True:
        response = requests.get(
            f"{base_url}/api/v4/projects/{project_id}/repository/commits",
            params={"per_page": 100, "page": page},
            headers=header,
        )
        # Fail loudly rather than iterating over the keys of an error payload.
        response.raise_for_status()
        commits = response.json()
        if not commits:
            # Ran out of commits without finding one before the deadline.
            return None
        for commit in commits:
            if loc_deadline is None:
                return commit["id"]
            commit_date = datetime.fromisoformat(commit["authored_date"])
            if commit_date <= loc_deadline:
                return commit["id"]
        page += 1


def format_fork_metadata(
    fork: Dict, header: Dict[str, str], deadline: Optional[str] = None
) -> Dict:
    """Format and add fields to a fork's metadata.

    Parameters
    ----------
    fork
        Fork record providing ``id``, ``http_url_to_repo``, ``visibility``,
        ``namespace.full_path`` and ``_links.members``.
    header
        HTTP headers sent with every request.
    deadline
        Optional ISO-8601 date-time forwarded to ``get_last_commit_hash``.

    Returns
    -------
    Dict
        Metadata with the following fields: ``id``, ``url``, ``visibility``,
        ``group``, ``members`` (username, name and email of each member),
        ``commit`` and ``autostart_url``. ``autostart_url`` is ``None`` when
        no commit was found, since a link without a commit cannot be built.

    Raises
    ------
    requests.HTTPError
        If the members request is answered with an error status.
    """
    meta = {
        "id": fork["id"],
        "url": fork["http_url_to_repo"],
        "visibility": fork["visibility"],
        "group": fork["namespace"]["full_path"],
    }

    # Additional API query to retrieve members metadata. The largest page
    # size is requested so that bigger groups are less likely to be cut off.
    response = requests.get(
        fork["_links"]["members"] + "/all",
        params={"per_page": 100},
        headers=header,
    )
    # Fail loudly rather than iterating over the keys of an error payload.
    response.raise_for_status()
    # Only retain relevant user fields; absent fields are stored as None.
    member_keys = ("username", "name", "email")
    meta["members"] = [
        {key: member.get(key) for key in member_keys} for member in response.json()
    ]

    base, namespace, repo = parse_repo_url(meta["url"])
    # Get the last commit before deadline
    meta["commit"] = get_last_commit_hash(meta["id"], base, header, deadline)

    if meta["commit"] is None:
        # Avoid emitting a broken link that contains "commit=None".
        meta["autostart_url"] = None
    else:
        # Build a renku autostart url using the commit hash: everything from
        # "/gitlab" onwards is stripped from the base URL.
        meta["autostart_url"] = (
            f"{re.sub(r'/gitlab.*$', '', base)}"
            f"/projects/{namespace}/{repo.removesuffix('.git')}/sessions/"
            f"new?autostart=1&commit={meta['commit']}&branch=master"
        )

    return meta


@click.command()
@click.option(
    "--deadline", type=str, help="ISO-8601 formatted date. Example: 2022-03-29T13:10:29"
)
@click.argument("repo_url", type=str)
@click.option(
    "--token",
    type=click.Path(exists=True),
    help="Path to a file containing the Gitlab API token. If not provided, you will be prompted for the token.",
)
def main(repo_url, token, deadline=None):
    """Print the metadata of all group-owned forks of REPO_URL as JSON.

    For each fork the output contains its id, url, visibility, group,
    members, the last commit before the deadline and a renku autostart URL.
    """
    # Check for valid deadline format before asking for any secret, so that
    # a malformed deadline fails immediately.
    if deadline is not None:
        validate_iso_date(deadline)

    if token is None:
        token = click.prompt("Please enter your Gitlab API token", hide_input=True)
    else:
        # Use a context manager so the token file is closed right away.
        with open(token, encoding="utf-8") as token_file:
            token = token_file.read().strip()

    # Get metadata of all forks from input project
    header = {"PRIVATE-TOKEN": token}
    forks = collect_forks(repo_url, header)
    # Only keep those which belong to a group
    forks = filter_group_forks(forks)

    # Reformat metadata for convenience
    meta = [format_fork_metadata(fork, header, deadline) for fork in forks]
    print(json.dumps(meta))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()