"""MassIVE datasets."""
import csv
import logging
import re
import socket
import xml.etree.ElementTree as ET  # noqa: N817
from pathlib import Path

import requests

from .ftp import FTPParser
from .project import BaseProject
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
class MassiveProject(BaseProject):
    """Retrieve information about a MassIVE project.

    MassIVE: `<https://massive.ucsd.edu>`_

    Parameters
    ----------
    msv_id : str
        The MassIVE identifier.
    local : str, pathlib.Path, or cloudpathlib.CloudPath, optional
        The local data directory in which the project files will be
        downloaded. In addition to local paths, paths to AWS S3,
        Google Cloud Storage, or Azure Blob Storage can be used.
    fetch : bool, optional
        Should ppx check the remote repository for updated metadata?
    timeout : float, optional
        The maximum amount of time to wait for a server response.

    Attributes
    ----------
    id : str
    local : Path object
    url : str
    title : str
    description : str
    metadata : dict
    fetch : bool
    timeout : float
    """

    # CSV endpoint that lists the files of a dataset.
    _api = "https://gnps-datasetcache.ucsd.edu/datasette/database/filename.csv"
    # Endpoint queried for the dataset links (used to find the FTP URL).
    _proxy_api = "https://massive.ucsd.edu/ProteoSAFe/proxi/v0.1/datasets/"

    def __init__(self, msv_id, local=None, fetch=False, timeout=10.0):
        """Instantiate a MassiveProject object."""
        super().__init__(msv_id, local, fetch, timeout)
        # Query parameters sent to ``_api`` by ``file_info()``.
        self._params = {
            "_stream": "on",
            "_sort": "filepath",
            "dataset__exact": self.id,
            "_size": "max",
        }

    def _validate_id(self, identifier):
        """Validate a MassIVE identifier.

        Parameters
        ----------
        identifier : str
            The project identifier to validate.

        Returns
        -------
        str
            The validated identifier, in upper case.

        Raises
        ------
        ValueError
            If the identifier is not "MSV" or "RMSV" followed by exactly
            nine digits.
        """
        identifier = str(identifier).upper()
        # fullmatch, not match: match only anchors the start of the string
        # and would accept identifiers with trailing characters.
        if not re.fullmatch(r"(MSV|RMSV)[0-9]{9}", identifier):
            raise ValueError("Malformed MassIVE identifier.")

        return identifier

    @property
    def url(self):
        """The FTP URL of the dataset.

        The URL is looked up from the remote service on first access and
        cached afterwards.

        Raises
        ------
        ValueError
            If the server response contains no FTP link.
        """
        if self._url is not None:
            return self._url

        res = requests.get(self._proxy_api + self.id, timeout=self.timeout)
        for link in res.json()["datasetLink"]:
            # This accession marks the FTP link among the dataset links.
            if link["accession"] == "MS:1002852":
                self._url = link["value"]
                return self._url

        raise ValueError(f"No FTP link was found for {self.id}")

    @property
    def metadata(self):
        """The project metadata as a dictionary.

        The metadata is parsed from ``ccms_parameters/params.xml``. The file
        is downloaded when no local copy exists or when ``fetch`` is true. A
        failed download is tolerated as long as a local copy is available.
        """
        if self._metadata is None:
            remote_file = "ccms_parameters/params.xml"
            metadata_file = self.local / remote_file

            # Explicit condition instead of an ``assert``: assertions are
            # removed under ``python -O``, which would force a download on
            # every access regardless of ``fetch``.
            if self.fetch or not metadata_file.exists():
                try:
                    self.download(remote_file, force_=True, silent=True)
                except (AssertionError, socket.gaierror):
                    # AssertionError is kept because the original handler
                    # also covered errors of that type raised by download.
                    # Fall back to the local copy; re-raise without one.
                    if not metadata_file.exists():
                        raise

            # Parse the XML
            root = ET.parse(metadata_file).getroot()
            self._metadata = {e.attrib["name"]: e.text for e in root}

        return self._metadata

    @property
    def title(self):
        """The title of this project."""
        return self.metadata["desc"]

    @property
    def description(self):
        """A description of this project."""
        return self.metadata["dataset.comments"]

    @staticmethod
    def _parse_file_listing(info):
        """Extract the project-relative file paths from the CSV listing.

        Parameters
        ----------
        info : str
            The CSV text returned by ``file_info()``; the first line is
            treated as the header and skipped.

        Returns
        -------
        list of str
            The first column of every row, with everything up to and
            including the first "/" removed. Rows without a "/" in the
            first column are skipped.
        """
        files = []
        # csv.reader honors quoting, so paths containing commas stay intact.
        for row in csv.reader(info.splitlines()[1:]):
            if not row:
                continue

            _, sep, relative = row[0].partition("/")
            if sep:
                files.append(relative)

        return files

    def remote_files(self, glob=None):
        """List the project files in the remote repository.

        The file listing is read through ``file_info()``. If that fails or
        yields no files, the FTP server is scraped instead.

        Parameters
        ----------
        glob : str, optional
            Use Unix wildcards to return specific files. For example,
            :code:`"*.mzML"` would return all of the mzML files.

        Returns
        -------
        list of str
            The remote files available for this project.
        """
        if self.fetch or self._remote_files is None:
            try:
                found = self._parse_file_listing(self.file_info())
            except (
                TimeoutError,
                ConnectionRefusedError,
                ConnectionResetError,
                socket.gaierror,
                socket.herror,
                EOFError,
                OSError,
                csv.Error,
            ):
                found = []

            # Explicit emptiness check instead of an ``assert`` so that the
            # fallback still happens under ``python -O``.
            if not found:
                LOGGER.debug("Scraping the FTP server for files...")
                found = self._parser.files

            self._remote_files = found

        if glob is not None:
            files = [f for f in self._remote_files if Path(f).match(glob)]
        else:
            files = self._remote_files

        return files

    def file_info(self):
        """Retrieve information about the project files.

        The response is cached in ``.file_info.csv`` inside the local data
        directory and reused unless ``fetch`` is true.

        Returns
        -------
        str
            Information about the files in a CSV format.

        Raises
        ------
        requests.HTTPError
            If the server does not answer with status code 200.
        """
        file_info_path = self.local / ".file_info.csv"
        if file_info_path.exists() and not self.fetch:
            with file_info_path.open("r") as ref:
                return ref.read()

        res = requests.get(
            self._api,
            params=self._params,
            timeout=self.timeout,
        )

        if res.status_code != 200:
            raise requests.HTTPError(f"Error {res.status_code}: {res.text}")

        with file_info_path.open("w+") as ref:
            ref.write(res.text)

        return res.text
def list_projects(timeout=10.0):
    """List all available projects on MassIVE.

    MassIVE: `<https://massive.ucsd.edu>`_

    The GNPS dataset cache is queried first. If the request fails or the
    server answers with an HTTP error status, the MassIVE FTP server is
    scraped instead.

    Parameters
    ----------
    timeout : float, optional
        The maximum amount of time to wait for a response from the server.

    Returns
    -------
    list of str
        A list of MassIVE identifiers.
    """
    url = "https://gnps-datasetcache.ucsd.edu/datasette/database.csv"
    params = {"sql": "select distinct dataset from filename", "_size": "max"}
    try:
        res = requests.get(url, params, timeout=timeout)
        # Without this check the body of an error page would be split into
        # lines and returned as if it were a list of identifiers.
        # requests.HTTPError derives from OSError, so the handler below
        # turns a bad status into the FTP fallback.
        res.raise_for_status()
        # The first line of the CSV response is the header.
        return sorted(res.text.splitlines()[1:])
    except (
        TimeoutError,
        ConnectionRefusedError,
        ConnectionResetError,
        socket.gaierror,
        socket.herror,
        EOFError,
        OSError,
    ):
        LOGGER.debug("Scraping the FTP server for projects...")
        parser = FTPParser(
            "ftp://massive.ucsd.edu/",
            max_depth=1,
            timeout=timeout,
        )
        # Keep the second path component of every directory that has one.
        return [d.split("/")[1] for d in parser.dirs if "/" in d]