Source code for ppx.massive

"""MassIVE datasets."""

import logging
import re
import socket
import xml.etree.ElementTree as ET  # noqa: N817
from pathlib import Path

import requests

from .ftp import FTPParser
from .project import BaseProject

LOGGER = logging.getLogger(__name__)


class MassiveProject(BaseProject):
    """Retrieve information about a MassIVE project.

    MassIVE: `<https://massive.ucsd.edu>`_

    Parameters
    ----------
    msv_id : str
        The MassIVE identifier.
    local : str, pathlib.Path, or cloudpathlib.CloudPath, optional
        The local data directory in which the project files will be
        downloaded. In addition to local paths, paths to AWS S3, Google
        Cloud Storage, or Azure Blob Storage can be used.
    fetch : bool, optional
        Should ppx check the remote repository for updated metadata?
    timeout : float, optional
        The maximum amount of time to wait for a server response.

    Attributes
    ----------
    id : str
    local : Path object
    url : str
    title : str
    description : str
    metadata : dict
    fetch : bool
    timeout : float
    """

    _api = "https://gnps-datasetcache.ucsd.edu/datasette/database/filename.csv"
    _proxy_api = "https://massive.ucsd.edu/ProteoSAFe/proxi/v0.1/datasets/"

    def __init__(self, msv_id, local=None, fetch=False, timeout=10.0):
        """Instantiate a MassiveProject object."""
        super().__init__(msv_id, local, fetch, timeout)
        self._params = {
            "_stream": "on",
            "_sort": "filepath",
            "dataset__exact": self.id,
            "_size": "max",
        }

    def _validate_id(self, identifier):
        """Validate a MassIVE identifier.

        Parameters
        ----------
        identifier : str
            The project identifier to validate.

        Returns
        -------
        str
            The validated identifier.
        """
        identifier = str(identifier).upper()
        if not re.match("(MSV|RMSV)[0-9]{9}", identifier):
            raise ValueError("Malformed MassIVE identifier.")

        return identifier

    @property
    def url(self):
        """The FTP URL of the dataset."""
        if self._url is not None:
            return self._url

        res = requests.get(self._proxy_api + self.id, timeout=self.timeout)
        for link in res.json()["datasetLink"]:
            if link["accession"] == "MS:1002852":
                self._url = link["value"]
                return self._url

        raise ValueError(f"No FTP link was found for {self.id}")

    @property
    def metadata(self):
        """The project metadata as a dictionary."""
        if self._metadata is None:
            remote_file = "ccms_parameters/params.xml"
            metadata_file = self.local / remote_file
            try:
                # Re-download an existing metadata file only when
                # self.fetch is true; a missing file is always fetched:
                if metadata_file.exists():
                    assert self.fetch

                # Fetch the data from the remote repository:
                self.download(remote_file, force_=True, silent=True)
            except (AssertionError, socket.gaierror) as err:
                if not metadata_file.exists():
                    raise err

            # Parse the XML:
            root = ET.parse(metadata_file).getroot()
            self._metadata = {e.attrib["name"]: e.text for e in root}

        return self._metadata

    @property
    def title(self):
        """The title of this project."""
        return self.metadata["desc"]

    @property
    def description(self):
        """A description of this project."""
        return self.metadata["dataset.comments"]
    def remote_files(self, glob=None):
        """List the project files in the remote repository.

        Parameters
        ----------
        glob : str, optional
            Use Unix wildcards to return specific files. For example,
            :code:`"*.mzML"` would return all of the mzML files.

        Returns
        -------
        list of str
            The remote files available for this project.
        """
        if self.fetch or self._remote_files is None:
            try:
                info = self.file_info().splitlines()[1:]
                self._remote_files = [
                    r.split(",")[0].split("/", 1)[1] for r in info
                ]
                assert self._remote_files
            except (
                TimeoutError,
                ConnectionRefusedError,
                ConnectionResetError,
                socket.gaierror,
                socket.herror,
                EOFError,
                OSError,
                AssertionError,
            ):
                LOGGER.debug("Scraping the FTP server for files...")
                self._remote_files = self._parser.files

        if glob is not None:
            files = [f for f in self._remote_files if Path(f).match(glob)]
        else:
            files = self._remote_files

        return files
    def file_info(self):
        """Retrieve information about the project files.

        Returns
        -------
        str
            Information about the files in a CSV format.
        """
        file_info_path = self.local / ".file_info.csv"
        if file_info_path.exists() and not self.fetch:
            with file_info_path.open("r") as ref:
                return ref.read()

        res = requests.get(
            self._api,
            params=self._params,
            timeout=self.timeout,
        )

        if res.status_code != 200:
            raise requests.HTTPError(f"Error {res.status_code}: {res.text}")

        with file_info_path.open("w+") as ref:
            ref.write(res.text)

        return res.text
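
# A minimal usage sketch for the class above (not part of this module). The
# accession below is illustrative only, and download() is assumed from
# BaseProject, matching the single-path call used in the metadata property:
#
#     from ppx.massive import MassiveProject
#
#     proj = MassiveProject("MSV000000001", local="data", fetch=True)
#     print(proj.title)                   # parsed from ccms_parameters/params.xml
#     mzml = proj.remote_files("*.mzML")  # Unix-style glob over remote files
#     proj.download(mzml[0])              # fetch one file into ./data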
def list_projects(timeout=10.0):
    """List all available projects on MassIVE.

    MassIVE: `<https://massive.ucsd.edu>`_

    Parameters
    ----------
    timeout : float, optional
        The maximum amount of time to wait for a response from the server.

    Returns
    -------
    list of str
        A list of MassIVE identifiers.
    """
    url = "https://gnps-datasetcache.ucsd.edu/datasette/database.csv"
    params = {"sql": "select distinct dataset from filename", "_size": "max"}
    try:
        res = requests.get(url, params, timeout=timeout).text.splitlines()[1:]
        res.sort()
        return res
    except (
        TimeoutError,
        ConnectionRefusedError,
        ConnectionResetError,
        socket.gaierror,
        socket.herror,
        EOFError,
        OSError,
    ):
        LOGGER.debug("Scraping the FTP server for projects...")
        parser = FTPParser("ftp://massive.ucsd.edu/", max_depth=1, timeout=timeout)
        return [d.split("/")[1] for d in parser.dirs if "/" in d]
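
# A minimal usage sketch for list_projects() (not part of this module). It
# first queries the GNPS dataset cache and falls back to scraping the MassIVE
# FTP server when the request fails; the longer timeout is illustrative:
#
#     from ppx.massive import list_projects
#
#     msv_ids = list_projects(timeout=30.0)
#     print(f"{len(msv_ids)} MassIVE projects found")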