Source code for ppx.factory

"""The PXDataset class and its associated methods.

This is the foundation of the ppx package.
"""

import logging
import re
from urllib.parse import urlparse

import requests

from .massive import MassiveProject
from .pride import PrideProject

LOGGER = logging.getLogger(__name__)


class PXDFactory:
    """Retrieve information about a ProteomeXchange project.

    Parameters
    ----------
    pxid : str
        A ProteomeXchange identifier.
    local : str, pathlib.Path, or cloudpathlib.CloudPath, optional
        The local data directory in which the project files will be
        downloaded. In addition to local paths, paths to AWS S3,
        Google Cloud Storage, or Azure Blob Storage can be used.
    fetch : bool, optional
        Should ppx check the remote repository for updated metadata?
    timeout : float, optional
        The maximum amount of time to wait for a server response.

    """

    rest = "http://proteomecentral.proteomexchange.org/cgi/GetDataset"

    repo_cv = {
        "MS:1002487": "MassIVE",
        "MS:1002632": "jPOST",
        "MS:1002836": "iProx",
        "MS:1002872": "Panorama",
    }

    ftp_map = {
        "ftp.pride.ebi.ac.uk": "PRIDE",
        "massive-ftp.ucsd.edu": "MassIVE",
    }

    def __init__(self, pxid, local=None, fetch=False, timeout=10.0):
        """Instantiate a PXDataset"""
        self._id = self._validate_id(pxid)
        self._local = local
        self._fetch = fetch
        self._timeout = timeout

        # Retrieve the data:
        params = {"ID": self.id, "outputMode": "JSON", "test": "no"}
        res = requests.get(self.rest, params=params, timeout=self._timeout)
        if res.status_code != 200:
            raise requests.HTTPError(f"Error {res.status_code}: {res.text}")

        self._url = res.url
        self._data = res.json()

        # Determine the partner repository
        self._repo, self._repo_id = self._resolve_repo()
        if self._repo_id != self.id:
            LOGGER.warning(
                "Repository ID %s differed from provided ID %s",
                self._repo_id,
                self.id,
            )

    @property
    def id(self):
        """The ProteomeXchange project identifier"""
        return self._id

    def find(self):
        """Find the dataset at the partner repository"""
        kwargs = {
            "local": self._local,
            "fetch": self._fetch,
            "timeout": self._timeout,
        }

        if self._repo == "PRIDE":
            return PrideProject(self._repo_id, **kwargs)

        if self._repo == "MassIVE":
            return MassiveProject(self._repo_id, **kwargs)

        raise RuntimeError("Unsupported partner repository.")

    def _resolve_repo(self):
        """Resolve what repository the data is in."""
        repos = {}
        for repo_id in self._data["identifiers"]:
            repo_name = self.repo_cv.get(repo_id["accession"])
            if repo_name is not None:
                repos[repo_name] = repo_id["value"]

        for link in self._data["fullDatasetLinks"]:
            if link["name"] == "Dataset FTP location":
                repo_name = self.ftp_map.get(urlparse(link["value"]).netloc)
                if repo_name == "PRIDE":
                    repos["PRIDE"] = self.id

        if not repos:
            raise ValueError(
                "No supported ProteomeXchange partner repository was found for"
                f" {self.id}. ppx currently supports PRIDE and MassIVE."
            )

        repo_order = ["PRIDE", "MassIVE"]
        for repo in repo_order:
            repo_id = repos.get(repo)
            if repo_id is not None:
                return repo, repo_id

        repo = ", ".join(repos.keys())
        raise ValueError(
            f"{self.id} was found in {repo}, which is not supported by ppx at "
            "this time. ppx currently supports PRIDE and MassIVE."
        )

    def _validate_id(self, identifier):
        """Validate a ProteomeXchange identifier"""
        identifier = str(identifier).upper()
        if not re.match("P[XR]D[0-9]{6}", identifier):
            raise ValueError("Malformed ProteomeXchange identifier.")

        return identifier



[docs]
def find_project(identifier, local=None, repo=None, fetch=False, timeout=10.0):
    """Find a project in the PRIDE or MassIVE repositories.

    Parameters
    ----------
    identifier : str
        The project identifier.
    local :  str, pathlib.Path, or cloudpathlib.CloudPath, optional
        The local data directory in which the project files will be
        downloaded. In addition to local paths, paths to AWS S3,
        Google Cloud Storage, or Azure Blob Storage can be used.
        The default is :code:`~/.ppx`
    repo : {"pride", "massive"}, optional
        The repository in which to look for the project. If :code:`None`,
        ppx will try to figure it out.
    fetch : bool, optional
        Should ppx check the remote repository for updated metadata?
    timeout : float, optional
        The maximum amount of time to wait for a server response

    Returns
    -------
    :py:class:`~ppx.PrideProject` or :py:class:`~ppx.MassiveProject`
        An object to interact with the project data in the repository.

    """
    identifier = str(identifier).upper()
    if repo is not None:
        repo = str(repo).lower()

    # User-specified:
    kwargs = {"local": local, "fetch": fetch, "timeout": timeout}
    if repo == "pride":
        return PrideProject(identifier, **kwargs)

    if repo == "massive":
        return MassiveProject(identifier, **kwargs)

    if repo is not None:
        raise ValueError("Unsupported repository.")

    # Try and figure it out:
    if identifier.startswith("MSV") or identifier.startswith("RMSV"):
        return MassiveProject(identifier, **kwargs)

    if re.match("P[XR]D", identifier):
        try:
            return PXDFactory(identifier, **kwargs).find()
        except requests.HTTPError:
            return PrideProject(identifier, **kwargs)

    raise ValueError("Malformed identifier.")