This dataset has BioSample (SAMN accession id) submitter and date information, gathered from NCBI. Only BioSample accession ids mapped by EuropePMC as of 04.09.2025 have been included. If you want to include accession ids from newer mappings, you need to re-download biosample.csv from https://europepmc.org/pub/databases/pmc/TextMinedTerms/
Here is how to recreate the biosample_info.parquet dataset:
import pandas as pd
from Bio import Entrez
import time
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
import requests
from urllib.parse import urlencode
import os
from dotenv import load_dotenv
import joblib

# Load environment variables from .env file
load_dotenv()

# Load the BioSample IDs from the joblib file; these are the BioSample-mapped
# accession_ids from EuropePMC, downloaded on 04.09.2025.
# If you want newer data, download the EuropePMC biosample.csv file again and
# extract the unique BioSample IDs like this:
# df_biosample = pd.read_csv("biosample.csv")  # Edit the path if needed
# biosample_unique_in_eupmc = df_biosample.biosample.unique()
biosample_unique_in_eupmc = joblib.load('biosample_unique_in_eupmc.joblib')  # Edit the path if needed, or skip this if you re-downloaded biosample.csv

# Set email (required by NCBI) and API key for higher rate limits
Entrez.email = "your_email"  # Make sure to add your e-mail here
Entrez.api_key = os.getenv('NCBI_API_KEY')  # Add this to your .env file; you get a key when you sign up at NCBI, it's free

if Entrez.api_key:
    print("✓ NCBI API key loaded successfully - using 10 requests/second limit")
else:
    print("⚠ No NCBI API key found - using 3 requests/second limit")

def process_batch(batch_ids):
    """Process a batch of biosample IDs with NCBI compliance"""
    try:
        id_list = ','.join(batch_ids)
        # Call the EFetch endpoint directly with NCBI-compliant parameters
        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        params = {
            'db': 'biosample',
            'id': id_list,
            'retmode': 'xml',
            'email': Entrez.email,
            'tool': 'python_script'  # Identify your tool
        }
        # Add API key if available
        if hasattr(Entrez, 'api_key') and Entrez.api_key:
            params['api_key'] = Entrez.api_key
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        # Parse XML
        root = ET.fromstring(response.content)
        batch_results = {}
        for biosample in root.findall('.//BioSample'):
            acc = biosample.get('accession')
            # Simplified submitter extraction
            submitter_parts = []
            # Organization
            owner = biosample.find('.//Owner/Name')
            if owner is not None and owner.text:
                submitter_parts.append(owner.text)
            # Contact person (simplified)
            contact = biosample.find('.//Owner/Contacts/Contact')
            if contact is not None:
                first = contact.find('.//Name/First')
                last = contact.find('.//Name/Last')
                if first is not None and last is not None and first.text and last.text:
                    submitter_parts.append(f"{first.text} {last.text}")
            submitter = ", ".join(submitter_parts) if submitter_parts else "Unknown"
            # Date (simplified): prefer submission_date, fall back to publication_date
            date = biosample.get('submission_date', biosample.get('publication_date', 'Unknown'))
            if date != 'Unknown':
                date = date.split('T')[0]
            if acc:
                batch_results[acc] = f"{submitter}; {date}"
        return batch_results
    except Exception as e:
        print(f"Error in batch: {e}")
        return {}

# For faster processing with all IDs
def process_all_biosamples_fast(biosample_ids, batch_size=200, max_workers=1):
    """Process all biosamples with NCBI-compliant rate limiting"""
    print(f"Processing {len(biosample_ids)} biosamples...")
    print("Following NCBI guidelines: max 3 requests/second (or 10 with API key)")
    # Create batches (NCBI recommends 200-500 IDs per batch)
    batches = [biosample_ids[i:i+batch_size] for i in range(0, len(biosample_ids), batch_size)]
    all_results = {}
    processed = 0
    # Calculate delay based on whether API key is available
    has_api_key = hasattr(Entrez, 'api_key') and Entrez.api_key
    delay = 0.1 if has_api_key else 0.34  # 10/sec with key, 3/sec without
    print(f"Using {delay}s delay between requests ({'with' if has_api_key else 'without'} API key)")
    # Process batches sequentially to respect rate limits
    for i, batch in enumerate(batches):
        # The original snippet is truncated from here on; the loop body below is
        # a plausible reconstruction, not the author's exact code.
        print(f"Batch {i + 1}/{len(batches)}")
        all_results.update(process_batch(list(batch)))
        processed += len(batch)
        time.sleep(delay)
    return all_results
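To produce the final parquet file, a minimal usage sketch follows (an assumption, not part of the original snippet; the column names are illustrative):

results = process_all_biosamples_fast(list(biosample_unique_in_eupmc))
df_info = pd.DataFrame(list(results.items()), columns=['biosample', 'submitter_and_date'])  # column names assumed
df_info.to_parquet('biosample_info.parquet', index=False)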
MIT License: https://opensource.org/licenses/MIT
License information was derived automatically
🧬 PMC Corpus Harvester
Build a disease-specific literature corpus in one evening! Gene-agnostic PubMed Central corpus builder for rare disease research. Downloads HTML + metadata from PMC for building RAG (Retrieval-Augmented Generation) systems. Originally developed for STXBP1-ARIA and SNAP25-ARIA projects.
✨ Features
Feature | Description
Gene-Agnostic | Configure for any gene or disease with YAML/JSON
NCBI API Support | 10 req/sec with API key (vs 3/sec… See the full description on the dataset page: https://huggingface.co/datasets/SkyWhal3/PMC-Corpus-Harvester.
This dataset was collected using the NCBI esearch API. It was processed by selecting the relevant fields (title, authors, and abstract), which were written into a single large text file (.txt) using a custom Bash script.
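As a rough illustration of this kind of collection step, here is a minimal Python sketch using Biopython's Entrez and Medline modules (the query term, retmax, and output file are placeholders; the original processing used a custom Bash script):

from Bio import Entrez, Medline

Entrez.email = "your_email@example.com"  # required by NCBI
handle = Entrez.esearch(db="pubmed", term="example query", retmax=100)  # placeholder query
pmids = Entrez.read(handle)["IdList"]
handle = Entrez.efetch(db="pubmed", id=",".join(pmids), rettype="medline", retmode="text")
with open("corpus.txt", "w") as out:
    for rec in Medline.parse(handle):
        # TI = title, AU = authors, AB = abstract in the MEDLINE format
        out.write(f"{rec.get('TI', '')}\n{'; '.join(rec.get('AU', []))}\n{rec.get('AB', '')}\n\n")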
License: https://www.bco-dmo.org/dataset/747872/license
This dataset includes metadata associated with NCBI BioProject PRJNA377729, "Impacts of Evolution on the Response of Phytoplankton Populations to Rising CO2" (https://www.ncbi.nlm.nih.gov/bioproject/PRJNA377729). The alga Heterosigma akashiwo was grown at CO2 levels from about 200 to 1000 ppm, and then the DNA and RNA were sequenced.
Access formats: .htmlTable, .csv, .json, .mat, .nc, .tsv
Acquisition description: Uni-algal, non-axenic cultures of Heterosigma akashiwo (CCMP2393) were grown in L1 medium (without silicate) made with a Long Island Sound seawater base collected from Avery Point, CT, USA (salinity 32) at 18°C with a 14:10 (light:dark) cycle and an irradiance of approximately 100 µmol m-2 s-1. Cells were acclimated in exponential growth phase to different carbonate chemistries in 1.2 L of L1 media in 2.5-L polycarbonate bottles. To control the carbonate chemistry of the water, the headspace of each bottle was purged continuously with a custom gas mixture of ~21% oxygen, ~79% nitrogen and either 200, 400, 600, 800 or 1000 ppmv CO2 (TechAir, NY).
At the point of harvest, 150 mL (~6 x 10^6 cells) were filtered onto a 5 µm pore size, 25 mm polycarbonate filter and flash frozen in liquid nitrogen. Genetic material from samples was extracted with the RNeasy Mini kit (Qiagen, Valencia, CA) and DNA was removed on-column using the RNase-free DNase Set (Qiagen), yielding total RNA. Total RNA extracts of the triplicate cultures were quantified on a 2100 Bioanalyzer (Agilent, Santa Clara, CA). Libraries were prepared using poly-A pull-down with the TruSeq Stranded mRNA Library Prep kit (Illumina, San Diego, CA). Library preparation, barcoding, and sequencing from each library were performed by the JP Sulzberger Columbia University Genome Center (New York, NY).
Sequence reads were de-multiplexed and trimmed to remove sequencing barcodes. Reads were aligned using Bowtie2 (Langmead and Salzberg 2012) to the MMETSP consensus contigs for Heterosigma akashiwo CCMP2393 (https://omictools.com/marine-microbial-eukaryotic-transcriptome-sequencing-project-tool).
Significant differences between physiological parameters by CO2 treatment were assessed with analysis of variance (ANOVA) and Tukey's honestly significant differences test (aov and TukeyHSD, stats, R). Differential expression of genes in any CO2 treatment compared to modern was determined using the general linear model (GLM) exact test (edgeR, R). Briefly, the read counts were normalized by trimmed mean of M-values (TMM) using the function calcNormFactors, tagwise dispersions were calculated with the function estimateGLMTagwiseDisp, a GLM was fit using glmFit, and log2 fold change (logFC) for each treatment was calculated relative to average expression at modern CO2. P-values from likelihood ratio tests were corrected for multiple testing using the false discovery rate (fdr) method.
Award: OCE-1314336, NSF Division of Ocean Sciences (program manager: David L. Garrison); http://www.nsf.gov/awardsearch/showAward?AWD_ID=1314336
DOI: 10.1575/1912/bco-dmo.747872.1
Dataset page: https://www.bco-dmo.org/dataset/747872
Comment: Hak_acclim — the harmful alga Heterosigma akashiwo (CCMP2393) grown under a range of CO2 concentrations from 200-1000 ppm. PIs: S. Dyhrman (LDEO), J. Morris (U Alabama). Version: 2018-10-11. See also: https://www.ncbi.nlm.nih.gov/bioproject/377729
Instrument: Automated DNA Sequencer — Illumina HiSeq 2500 paired-end sequencing (PE100) with TruSeq RNA Sample Prep Kit (Illumina, San Diego, CA); used to prepare the mRNA libraries. Samples were barcoded for multiplex sequencing and run in a single lane by the Columbia University Genome Center (CUGC) (New York, NY).
People: Sonya T. Dyhrman (Lamont-Doherty Earth Observatory, Principal Investigator); James Jeffrey Morris (University of Alabama at Birmingham, Co-Principal Investigator); Gwenn Hennon (Lamont-Doherty Earth Observatory, Scientist); Nancy Copley (WHOI BCO-DMO, Data Manager).
Project: P-ExpEv — Impacts of Evolution on the Response of Phytoplankton Populations to Rising CO2 (2013-06 to 2017-05; experiment housed in laboratories at Michigan State University). Note: this project is also affiliated with the NSF BEACON Center for the Study of Evolution in Action.
Project description from the NSF award: Human activities are driving up atmospheric carbon dioxide concentrations at an unprecedented rate, perturbing the ocean's carbonate buffering system, lowering oceanic pH, and changing the concentration and composition of dissolved inorganic carbon. Recent studies have shown that this ocean acidification has many short-term effects on phytoplankton, including changes in carbon fixation among others. These physiological changes could have profound effects on phytoplankton metabolism and community structure, with concomitant effects on Earth's carbon cycle and, hence, global climate. However, extrapolation of present understanding to the field is complicated by the possibility that natural populations might evolve in response to their changing environments, leading to different outcomes than those predicted from short-term studies. Indeed, evolution experiments demonstrate that microbes are often able to rapidly adapt to changes in the environment, and that beneficial mutations are capable of sweeping large populations on time scales relevant to predictions of environmental dynamics in the coming decades. This project addresses two major areas of uncertainty for phytoplankton populations with the following questions: 1) What adaptive mutations to elevated CO2 are easily accessible to extant species, how often do they arise, and how large are their effects on fitness? 2) How will physical and ecological interactions affect the expansion of those mutations into standing populations? This study will address these questions by coupling experimental evolution with computational modeling of ocean biogeochemical cycles. First, cultured unicellular phytoplankton, representative of major functional groups (e.g. cyanobacteria, diatoms, coccolithophores), will be evolved under simulated year 2100 CO2 concentrations. From these experiments, estimates will be made of a) the rate of beneficial mutations, b) the magnitude of fitness gains conferred by these mutations, and c) secondary phenotypes (i.e., trade-offs) associated with these mutations, assayed using both physiological and genetic approaches. Second, an existing numerical model of the global ocean system will be modified to a) simulate the effects of changing atmospheric CO2 concentrations on ocean chemistry, and b) allow the introduction of CO2-specific adaptive mutants into the extant populations of virtual phytoplankton. The model will be used to explore the ecological and biogeochemical impacts of beneficial mutations in realistic environmental situations (e.g. resource availability, predation, etc.). Initially, the model will be applied to idealized sensitivity studies; then, as experimental results become available, the implications of the specific beneficial mutations observed in our experiments will be explored. This interdisciplinary study will provide novel, transformative understanding of the extent to which evolutionary processes influence phytoplankton diversity, physiological ecology, and carbon cycling in the near-future ocean. One of many important outcomes will be the development and testing of nearly-neutral genetic markers useful for competition studies in major phytoplankton functional groups, which has applications well beyond the current proposal.
=== Genome sequences ===
These are the different genome references (FASTA format) available for:
The two genome assemblies of S. habrochaites LA1777 and PI127826 were obtained through a combination of 10X Linked-Reads and BioNano optical mapping. This sequencing was funded by the DTL Technology Hotel 2018 funding scheme.
=== Transcriptomes and proteomes ===
=== Genome annotations files ===
Solanum lycopersicum:
Solanum lycopersicoides:
Solanum habrochaites:
References:
Tomato Genome Sequencing Consortium. 2012. The tomato genome sequence provides insights into fleshy fruit evolution. Nature 485: 635–641.
Bolger et al. 2014. The genome of the stress-tolerant wild tomato species Solanum pennellii. Nature Genetics 46(9). http://www.nature.com/ng/journal/v46/n9/full/ng.3046.html
Hosmani et al. 2019. An improved de novo assembly and annotation of the tomato reference genome using single-molecule sequencing, Hi-C proximity ligation and optical maps. bioRxiv. https://www.biorxiv.org/content/10.1101/767764v1
Aflitos et al. 2014. Exploring genetic variation in the tomato (Solanum section Lycopersicon) clade by whole-genome sequencing. The Plant Journal. https://onlinelibrary.wiley.com/doi/full/10.1111/tpj.12616
Stam et al. 2019. The de novo reference genome and transcriptome assemblies of the wild tomato species Solanum chilense highlights birth and death of NLR genes between tomato species. G3: Genes, Genomes, Genetics 9(12): 3933-3941. https://doi.org/10.1534/g3.119.400529
PubMed-IV Dataset
The PubMed-IV dataset is derived from PubMed abstracts and metadata, collected using the NCBI E-utilities API. It includes structured fields such as title, abstract text (including structured sections like Conclusions when available), authors, journal metadata, and identifiers (PMID, DOI, etc.). No full-text articles are included. Data from PubMed, a service of the U.S. National Library of Medicine (NLM). PubMed data is in the public domain. NLM does not endorse… See the full description on the dataset page: https://huggingface.co/datasets/KevinZonda/PubMed-IV.
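Since this is a Hugging Face dataset, one plausible way to load it is with the datasets library (a sketch; the split name is an assumption, and the field names follow the description above):

from datasets import load_dataset

pubmed_iv = load_dataset("KevinZonda/PubMed-IV", split="train")  # split name assumed
print(pubmed_iv[0])  # expected fields include title, abstract, authors, and identifiers (PMID, DOI)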
This dataset uses the LitVar API, a service provided by NCBI, to retrieve relevant literature for a given variant.
Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
This submission presents metagenomic sequencing identifications and mass-spectrometry datasets for the following publication:
Brinkmann, B.W.; Zhiling, G.; Vijver, M.G.; Peijnenburg, W.J.G.M.; Chetwynd, A.J. Host and microbiome proteins in eco-coronas: abundance, physicochemical properties and binding partners. Environ. Sci.: Nano. 2025, DOI: 10.1039/d5en00493d.
1. Metagenomic sequencing identifications
Tab-delimited text files with genus-level read abundances identified in whole-body metagenomes of:
The data in both files were generated using the Pavian webtool (https://fbreitwieser.shinyapps.io/pavian/), accessed on 25 October 2024. Columns with read abundances present results for 3 biological replicates and an extraction kit metagenome (blank). The TaxId column presents the Taxonomy Identifier from the NCBI Taxonomy Browser. The associated metagenomic data are deposited in the NCBI Sequence Read Archive under BioProject ID PRJNA1336773 (http://www.ncbi.nlm.nih.gov/bioproject/1336773).
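A minimal sketch of reading one of these genus-level tables with pandas (the file name is hypothetical; the column layout follows the description above):

import pandas as pd

genus = pd.read_csv("genus_read_abundances.tsv", sep="\t")  # hypothetical file name
# Per the description, columns hold read abundances for 3 biological replicates,
# an extraction-kit blank, and a TaxId column with NCBI Taxonomy identifiers.
print(genus.head())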
2. Proteomic datasets
Tab-delimited mass-spectrometry datasets and metadata obtained for experiments with:
Sample names in the mass spectrometry datasets consist of the following three elements: {sample type}_{microbiome condition}_{replicate}
where:
Accession numbers were obtained from the UniProt KB protein knowledgebase.
Column names for metadata present:
U.S. Government Works: https://www.usa.gov/government-works
License information was derived automatically
Honey bees (Apis mellifera), a critical agricultural pollinator in many areas, have a high rate of infection with a large DNA virus, Apis mellifera filamentous virus (AmFV), yet little is known about its ecology or impact on honey bee colonies, other than its ubiquity and apparent low virulence. This study scanned over 5,000 public data sets to detect AmFV sequences in honey bees as well as in a parasitic mite of honey bees, Varroa destructor, which is a potential vector of AmFV. The data release consists of these files:
1. AmFV.genome.assemblies.aligned.fas, which contains new AmFV draft genome sequences generated by this study, aligned with existing reference genome accessions downloaded from the National Center for Biotechnology Information (NCBI).
2. kmer.list.txt, a list of kmers that were extracted from reference sequences and searched for in Sequence Read Archive (SRA) accessions.
3. sample.metadata.txt, which lists all accessions of the SRA, and NCBI database of high-throughp ...
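As a rough illustration of the k-mer step described above, a minimal Python sketch of extracting the set of k-mers from a reference sequence follows (the k value and input string are placeholders; the study's actual extraction and SRA search pipeline is not part of this record):

def kmers(sequence, k=31):  # k=31 is a common choice, assumed here
    # Return the set of all k-length substrings of a DNA sequence
    sequence = sequence.upper()
    return {sequence[i:i + k] for i in range(len(sequence) - k + 1)}

reference = "ATGGCGTACGTTAGCATGGC"  # placeholder; real input would come from the AmFV reference FASTA
print(len(kmers(reference, k=5)))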
MIT License: https://opensource.org/licenses/MIT
License information was derived automatically
This dataset was created by extracting academic research papers from PubMed (https://pubmed.ncbi.nlm.nih.gov) with the official PubMed API. It was created solely for educational purposes; the creator intends no malpractice with this dataset.
Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
Description
This repository contains a comprehensive dataset focused on Parkinson's Disease. We provide data extracted via web scraping, along with metadata resulting from the extraction process using the NCBI API. The data pertains to the article titled 'A bibliometric study on Parkinson's Disease based on the open access data of the Michael J. Fox Foundation'.
Metadata Description
Analisys_MJFF_05_04_2024.xlsx
Field | Description | Data Type
AU | List of authors in abbreviated format. | Text
AF | List of authors with full names. | Text
TI | Full title of the article. | Text
SO | Name of the journal or publication. | Text
SO_CO | Country of origin of the publication. | Text
LA | Language of the article. | Text
DT | Type of document, such as "Journal Article". | Text
DE | Keywords or descriptors associated with the article. | Text
MESH | MeSH terms that describe the content of the article. | Text
DI | Digital Object Identifier (DOI). | Text
PG | Number of pages or page range. | Numeric
GRANT_ID | Identification of funding, when available. | Text
GRANT_ORG | Organization that provided the funding. | Text
UT, PMID | Unique identifiers of the article. | Numeric
DB | Name of the database where the article is indexed. | Text
AU_UN | Information about the academic unit or institution of the authors. | Text
References_MJFF_v2_Final_Corrected.csv
Field | Description | Data Type
Title | Name of the article or publication. | Text
Authors | List of authors who contributed to the article. | Text
Journal Name | Name of the journal or periodical where the article was published. | Text
Publisher | Name of the publisher who published the article. | Text
Volume | Volume number of the journal in which the article appears. | Numeric or Text
Edition Number | Number of the edition of the journal in which the article is found. | Numeric or Text
Starting Page | Number of the first page of the article in the publication. | Numeric
Ending Page | Number of the last page of the article. | Numeric
Publication Date | Date on which the article was published. | Date
Open Access Status | Indicates whether the article is available in open access. | Text
License | Type of license under which the article was published. | Text
DOI (Digital Object Identifier) | Unique identifier for the article that provides a permanent link to online access. | Text
OA Location URL | Direct URL to the article, if available in open access. | Text
Citation Count | Number of times the article has been cited by other publications. | Numeric
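A minimal sketch of loading the references file and inspecting citation counts with pandas (the column spellings follow the field table above but should be checked against the actual CSV):

import pandas as pd

refs = pd.read_csv("References_MJFF_v2_Final_Corrected.csv")
top_cited = refs.sort_values("Citation Count", ascending=False).head(10)  # most-cited references
print(top_cited[["Title", "Citation Count"]])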
CC0 1.0: https://spdx.org/licenses/CC0-1.0.html
Automatically identifying chemical and drug names in scientific publications advances information access for this important class of entities in a variety of biomedical disciplines by enabling improved retrieval and linkage to related concepts. While current methods for tagging chemical entities were developed for the article title and abstract, their performance in the full article text is substantially lower. However, the full text frequently contains more detailed chemical information, such as the properties of chemical compounds, their biological effects, and interactions with diseases, genes, and other chemicals.
We, therefore, present the NLM-Chem corpus, a full-text resource to support the development and evaluation of automated chemical entity taggers. The NLM-Chem corpus consists of 150 full-text articles, doubly annotated by ten expert NLM indexers, with ~5000 unique chemical name annotations, mapped to ~2000 MeSH identifiers. Using this corpus, we built a substantially improved chemical entity tagger, with automated annotations for all of PubMed and PMC freely accessible through the PubTator web-based interface and API.
Methods
The NLM-Chem corpus consists of 150 full-text articles from the PubMed Central Open Access dataset, comprising 67 different chemical journals, aiming to cover a general distribution of usage of chemical names in the biomedical literature. Articles were selected so that human annotation was most valuable (meaning that they were rich in bio-entities, and current state-of-the-art named entity recognition systems disagreed on bio-entity recognition).
Ten indexing experts at the National Library of Medicine manually annotated the corpus using the TeamTat annotation system, which allows swift annotation project management. The corpus was annotated in three batches, and each batch of articles was annotated in three annotation rounds. Annotators were randomly paired for each article, and pairings were randomly shuffled for each subsequent batch. In this manner, the workload was distributed fairly. To control for bias, annotator identities were hidden for the first two annotation rounds. In the final annotation round, annotators worked collaboratively to resolve the final few annotation disagreements and reach 100% consensus.
The full-text articles were fully annotated for all chemical name occurrences in text, and the chemicals were mapped to Medical Subject Heading (MeSH) entries to facilitate indexing and other downstream article processing tasks at the National Library of Medicine. MeSH is part of the UMLS and as such, chemical entities can be mapped to other standard vocabularies.
The data has been evaluated for high annotation quality, and its use as training data has already improved chemical named entity recognition in PubMed. The newly improved system has already been incorporated in the PubTator API tools (https://www.ncbi.nlm.nih.gov/research/pubtator/api.html).
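As a sketch of retrieving those automated annotations programmatically, here is a minimal call to the PubTator export service (the endpoint, parameters, and response structure below are assumptions to verify against the PubTator API documentation linked above):

import requests

url = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson"
resp = requests.get(url, params={"pmids": "29446767"}, timeout=30)  # example PMID, arbitrarily chosen
resp.raise_for_status()
doc = resp.json()  # one BioC JSON document for the article
for passage in doc.get("passages", []):
    for ann in passage.get("annotations", []):
        if ann.get("infons", {}).get("type") == "Chemical":
            print(ann["text"], ann["infons"].get("identifier"))  # chemical mention and its MeSH id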
Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
NCBIfam is a collection of protein families featuring curated multiple sequence alignments, hidden Markov models (HMMs), and annotation; it provides a tool for identifying functionally related proteins based on sequence homology. NCBIfam is maintained at the National Center for Biotechnology Information (Bethesda, MD). NCBIfam includes models from TIGRFAMs, another database of protein families, developed at The Institute for Genomic Research and later at the J. Craig Venter Institute (Rockville, MD, US).
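Collections of profile HMMs like NCBIfam are typically searched with HMMER; a minimal sketch of driving HMMER's hmmscan from Python follows (the file paths are placeholders, and the HMM database must first be prepared with hmmpress):

import subprocess

# Search protein sequences against the NCBIfam profile HMMs (paths are hypothetical)
subprocess.run(
    ["hmmscan", "--tblout", "hits.tbl", "NCBIfam.hmm", "proteins.faa"],
    check=True,
)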
Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
We analysed the field of expression profiling by high-throughput sequencing (HT-seq) in terms of replicability and reproducibility, using data from the NCBI GEO (Gene Expression Omnibus) repository.
- This release includes GEO series published up to Dec 31, 2020.
The geo-htseq.tar.gz archive contains the following files:
- output/parsed_suppfiles.csv, p-value histograms, histogram classes, estimated number of true null hypotheses (pi0).
- output/document_summaries.csv, document summaries of NCBI GEO series.
- output/suppfilenames.txt, list of all supplementary file names of NCBI GEO submissions.
- output/suppfilenames_filtered.txt, list of supplementary file names used for downloading files from NCBI GEO.
- output/publications.csv, publication info of NCBI GEO series.
- output/scopus_citedbycount.csv, Scopus citation info of NCBI GEO series.
- output/spots.csv, NCBI SRA sequencing run metadata.
- output/cancer.csv, cancer related experiment accessions.
- output/transcription_factor.csv, TF related experiment accessions.
- output/single-cell.csv, single cell experiment accessions.
- blacklist.txt, list of supplementary files that were either too large to import or were causing computing environment crash during import.
The workflow to produce this dataset is available on GitHub at rstats-tartu/geo-htseq.
The geo-htseq-updates.tar.gz archive contains the following files:
- results/detools_from_pmc.csv, differential expression analysis programs inferred from published articles
- results/n_data.csv, manually curated sample size info for NCBI GEO HT-seq series
- results/simres_df_parsed.csv, pi0 values estimated from differential expression results obtained from simulated RNA-seq data
- results/data/parsed_suppfiles_rerun.csv, pi0 values estimated using smoother method from anti-conservative p-value sets
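The pi0 values reported in these files estimate the proportion of true null hypotheses behind a p-value set; as a generic illustration (not necessarily the exact estimator used for this dataset), a Storey-type estimate at a fixed lambda looks like this:

import numpy as np

def pi0_estimate(p_values, lam=0.5):
    """Share of p-values above lam, rescaled by the width of that tail."""
    p = np.asarray(p_values)
    return min(1.0, (p > lam).mean() / (1.0 - lam))

rng = np.random.default_rng(0)
p_toy = np.concatenate([rng.uniform(size=800), rng.beta(0.1, 10, size=200)])  # 80% true nulls
print(pi0_estimate(p_toy))  # close to 0.8 for this toy mixture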
Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
Next-generation sequencing (NGS) analysis of cell-free DNA provides valuable insights into a spectrum of pathogenic species (particularly bacterial) in blood. Patients with sepsis often face delays in treatment regimens (combinations or cocktails of antibiotics) due to the long turnaround time (TAT) of classical, standard blood-culture procedures. NGS gives results with a lower TAT along with high-depth coverage, so it may offer a way to decide treatment regimens for patients more accurately and without losing precious time, possibly saving lives.
Our curated dataset lists the bacterial species or strains detected, along with their genome sizes, in 107 AML patients clinically diagnosed with sepsis. Cell-free DNA profiles of patients were built, and sequencing was done on Illumina instruments (NovaSeq and NextSeq). Bioinformatic analysis was performed using two classification algorithms, kraken2 and kaiju. For kraken2-based classification, the reference bacterial index developed by Carlo Ferravante et al. (Zenodo 2020; https://zenodo.org/records/4055180) was used, while for kaiju-based classification the reference database named "nr_euk", dated 2023-05-10 (https://bioinformatics-centre.github.io/kaiju/downloads.html), was used.
Genome size annotation is important in metagenomics because computing depth of coverage (abundance) requires the genome size. Metagenomic classification algorithms like kraken/kraken2 and kaiju output only the reads assigned, not abundance. With kaiju the problem is more complicated, since the reference database does not include a FASTA file, only an index file from which alignment is done.
To address the above challenges and compute "depth of coverage" (or simply abundance), we built a genome size annotator tool (https://github.com/patkarlab/Genome-Size-Annotation) which provides the genome size for each species detected, given that its taxid is available. This tool uses the NCBI Datasets tool, an NCBI Genome API check tool, and data mining from AI search engines like perplexity.ai.
We have curated two datasets:
- Kraken2 dataset, named "FINAL METAGENOMIC DATA MASTERSHEET - kraken_genome_annotation"
- Kaiju dataset, named "FINAL METAGENOMIC DATA MASTERSHEET - kaiju_genome_annotation"
*Please note that for the kraken2-curated dataset we used data mining from the AI search engine perplexity.ai, while for kaiju we did not use perplexity.ai; any species whose genome size was not found was labeled "NA".
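As a sketch of the abundance computation that these genome sizes enable, depth of coverage can be estimated from the reads a classifier assigns to a species (a generic formula; the read length and counts below are toy values, and the tool's exact computation is not given in this record):

def depth_of_coverage(reads_assigned, read_length_bp, genome_size_bp):
    # Average depth = total sequenced bases attributed to the species / genome size
    return reads_assigned * read_length_bp / genome_size_bp

# Toy example: 200,000 assigned reads of 150 bp against a 5 Mb genome
print(depth_of_coverage(200_000, 150, 5_000_000))  # 6.0x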
CC0 1.0: https://creativecommons.org/publicdomain/zero/1.0/
Islamaj, Rezarta; Leaman, Robert; Lu, Zhiyong (2021). NLM-Chem, a new resource for chemical entity recognition in PubMed full-text literature. Dryad, Dataset. https://doi.org/10.5061/dryad.3tx95x6dz
License: https://www.bco-dmo.org/dataset/745518/license
Seawater was collected via Niskin bottles mounted with a CTD at the San Pedro Ocean Time-series (SPOT) station off the coast of Southern California, near the surface (5 m) and at 150 and 890 m, in late May 2015. Raw sequence data were generated as part of a metatranscriptome study targeting the protistan community. Raw sequences are available at the National Center for Biotechnology Information (NCBI) Sequence Read Archive (SRA) database (SRA Study ID: SRP110974, BioProject: PRJNA391503). Sequences for BioProject PRJNA608423 will be available at NCBI on Jan 1st, 2021. These data were published in Hu et al. (2018).
Access formats: .htmlTable, .csv, .json, .mat, .nc, .tsv
Acquisition description: Seawater was collected from the San Pedro Ocean Time-series (SPOT) station off the coast of Southern California near the surface (5 m), 150 and 890 m, in late May 2015. Briefly, seawater was pre-filtered (80 µm) into 20 L carboys to minimize the presence of multicellular eukaryotes. Replicate samples (ranging in volume from 1.5-3.5 L) from each depth were filtered onto sterile GF/F filters (nominal pore size 0.7 µm, Whatman International Ltd., Florham Park, NJ). While we cannot avoid some impact that sample handling (i.e., bringing samples to the surface) may have had on our results, filters were immediately placed in 1.5 mL of lysis buffer and flash frozen in liquid nitrogen in < 40 min and away from light to minimize RNA degradation.
Total RNA was extracted from each filter using a DNA/RNA AllPrep kit (Qiagen, Valencia, CA, #80204) with an in-line genomic DNA removal step (RNase-free DNase reagents, Qiagen #79254) (dx.doi.org/10.17504/protocols.io.hk3b4yn). Extracted RNA was quality checked and low-biomass samples were pooled. Six replicates were processed and sequenced from the surface, while pairs of filters were pooled for either 150 or 890 m, yielding 3 and 4 replicates respectively (Supporting Information Table S1). RNA concentrations were normalized before library preparation (Supporting Information). ERCC spike-in was added before sequence library preparation with Kapa's Stranded mRNA Library Preparation Kit, using poly-A tail selection beads to select for eukaryotic mRNA (Kapa Biosystems, Inc., Wilmington, MA, #KK8420).
Also see:
"%5C%22https://www.protocols.io/view/sample-collection-from-the-field-%0Afor-downstream-mo-hisb4eehttps://www.protocols.io/view/rna-and-optional-dna-%0Aextraction-from-environmental-hk3b4yn%5C%22">https://www.protocols.io/view/sample-collection-from-the-field-for- downs...
The associated assembly files can be found at Zenodo (see Hu, S. K. (2017), DOI: 10.5281/zenodo.1202041). The assembly files were also published in the journal publication Hu et al. (2018).
Related code can be found in the GitHub repository https://github.com/shu251/SPOT_metatranscriptome. The version of the code used for these publications can be found in the Supplemental Files section of this page.
Award: OCE-1737409, NSF Division of Ocean Sciences (program manager: David L. Garrison); http://www.nsf.gov/awardsearch/showAward.do?AwardNumber=1737409
DOI: 10.26008/1912/bco-dmo.745518.2 (data version 2: 2020-02-26; final, no updates)
Dataset page: https://www.bco-dmo.org/dataset/745518
Comment: Microbial eukaryotic focused metatranscriptome data. PI: David Caron.
Instruments: Niskin bottles combined with a CTD, used to collect discrete water samples; Automated DNA Sequencer — HiSeq High Output 125 bp PE sequencing was performed at the UPC Genome Core at the University of Southern California, Los Angeles, CA (BioProject: PRJNA391503).
People: David Caron (University of Southern California, Principal Investigator); Sarah K. Hu (University of Southern California, Co-Principal Investigator and Contact); Amber D. York (WHOI BCO-DMO, Data Manager).
Project: SPOT — Protistan, prokaryotic, and viral processes at the San Pedro Ocean Time-series (end date 2021-07; San Pedro Channel off the coast of Los Angeles).
Project description: Planktonic marine microbial communities consist of a diverse collection of bacteria, archaea, viruses, protists (phytoplankton and protozoa) and small animals (metazoans). Collectively, these species are responsible for virtually all marine pelagic primary production, where they form the basis of food webs and carry out a large fraction of respiratory processes. Microbial interactions include the traditional role of predation, but recent research recognizes the importance of parasitism, symbiosis and viral infection. Characterizing the response of pelagic microbial communities and processes to environmental influences is fundamental to understanding and modeling carbon flow and energy utilization in the ocean, but very few studies have attempted to study all of these assemblages in the same study. This project comprises long-term (monthly) and short-term (daily) sampling at the San Pedro Ocean Time-series (SPOT) site. Analysis of the resulting datasets investigates co-occurrence patterns of microbial taxa (e.g. protist-virus and protist-prokaryote interactions, both positive and negative), indicating which species consistently co-occur and potentially interact, followed by examination of gene expression to help define the underlying mechanisms. This study augments 20 years of baseline studies of microbial abundance, diversity, and rates at the site, and will enable detection of low-frequency changes in composition and potential ecological interactions among microbes, and their responses to changing environmental forcing factors. These responses have important consequences for higher trophic levels and ocean-atmosphere feedbacks. The broader impacts of this project include training graduate and undergraduate students, providing local high school students with summer lab experiences, and PI presentations at local K-12 schools, museums, aquaria and informal learning centers in the region. Additionally, the PIs advise at the local, county and state level regarding coastal marine water quality. This research project is unique in that it is a holistic study (including all microbes from viruses to small metazoa) of microbial species diversity and ecological activities, carried out at the SPOT site off the coast of southern California. In studying all microbes simultaneously, this work aims to identify important ecological interactions among microbial species, and identify the basis(es) for those interactions. This research involves (1) extensive analyses of prokaryote (archaean and bacterial) and eukaryote (protistan and micro-metazoan) diversity via the sequencing of marker genes, (2) studies of whole-community gene expression by eukaryotes and prokaryotes in order to identify key functional characteristics of microorganismal groups and the detection of active viral infections, and (3) metagenomic analysis of viruses and bacteria to aid interpretation of transcriptomic analyses using genome-encoded information. The project includes exploratory metatranscriptomic analysis of poorly-understood aphotic and hypoxic-zone protists, to examine their stratification, functions and hypothesized prokaryotic symbioses.
License: https://www.bco-dmo.org/dataset/817436/license
To document the effects of storm-driven freshwater runoff on sponge-associated microbiomes, we leveraged the heavy rainfall associated with Tax Day Flooding (July 2016) and Hurricane Harvey (August 2017) to characterize sponge-associated bacterial communities at five time points: in July 2016 (at detection of the mortality event), one month after the mortality event (August 2016), immediately after Hurricane Harvey (September 2017), one month after Hurricane Harvey (October 2017), and approximately one year following Hurricane Harvey (October 2018).
These data contain Sequence Read Archive (SRA) and BioSample accession numbers associated with BioProject PRJNA605902 (see https://www.ncbi.nlm.nih.gov/bioproject/605902) at the National Center for Biotechnology Information.
Access formats: .htmlTable, .csv, .json, .mat, .nc, .tsv, .esriCsv, .geoJson
Location:
East and West Banks of the Flower Garden Banks National Marine Sanctuary (FGBNMS)
Sampling Events:
NOAA FGBNMS Cruise July 2016, NOAA FGBNMS Cruise August 2016
Hurricane Harvey FGB October 2017, Hurricane Harvey FGB October 2018
Methodology:
V4-16S bacterial community libraries were prepared, and PE 250 bp reads were generated using the Illumina MiSeq platform.
Sampling and analytical procedures:
Samples were flash frozen in liquid nitrogen and stored at -20°C until further processing. DNA was extracted from 250 mg of sponge sample using the NucleoSpin Soil DNA extraction kit (Takara Bio) or the DNeasy PowerSoil DNA extraction kit (QIAGEN).
Award: OCE-1800914, NSF Division of Ocean Sciences (program manager: Daniel Thornhill); http://www.nsf.gov/awardsearch/showAward.do?AwardNumber=1800914
DOI: 10.26008/1912/bco-dmo.817436.1 (data version 1: 2020-07-23; final, no updates)
Dataset page: https://www.bco-dmo.org/dataset/817436
Comment: Sponge-associated microbial communities (via 16S-V4 rRNA amplicon sequencing) following storm-driven flooding. PI: Adrienne Simoes Correa.
Geospatial coverage: 27.8819 to 27.9078 degrees north, -93.62829 to -93.6002 degrees east, 16.5 to 28.0 m depth.
Instrument: Automated DNA Sequencer (Illumina MiSeq platform).
People: Adrienne Simoes Correa (Rice University, Principal Investigator); Lory Santiago-Vazquez (University of Houston-Clear Lake, Co-Principal Investigator); Amanda Shore (Rice University, Contact); Amber D. York (WHOI BCO-DMO, Data Manager).
Project: Rapid Reefs Harvey — RAPID: Collaborative Research: Impact of freshwater runoff from Hurricane Harvey on coral reef benthic organisms and associated microbial communities (2017-12 to 2019-11; Flower Garden Banks National Marine Sanctuary, northwest Gulf of Mexico).
NSF award abstract: Coral reefs are ecologically and economically important ecosystems, and are threatened by a variety of global (climate change) and local (overfishing, pollution) stressors. Anthropogenic climate change is increasing the frequency and severity of storms, which can physically damage reef structures and reduce reef health through changes in seawater quality. In August of 2017, Hurricane Harvey caused widespread flooding in southeast Texas when it released more than 50 trillion liters of rain, which then accumulated along the Texas Shelf. This runoff is expected to impact nearby coral reefs in the Flower Garden Banks National Marine Sanctuary (FGBNMS, northwest Gulf of Mexico) via eddies and jets that transport coastal waters offshore. Findings from this project will allow managers to quickly predict whether extreme storm events are likely to induce reef mortality and ecosystem decline due to freshwater accumulation, by tracking low-salinity water masses coupled with microbial community characterization and metrics of coral health. These data are critical to managing coastal ecosystems, including the high coral cover reefs in the FGBNMS, and will help stakeholders (e.g., diving and fishing communities) plan for and minimize disruption to their livelihoods following these storms. Results will be communicated broadly across scientific arenas, in graduate and undergraduate education and training programs, and to the general public through outreach. The investigators have seven 7-square-meter 2-D Reef Replicas from 2014 depicting representative FGBNMS reef bottoms, and will construct additional 2-D Reef Replicas from both banks following the arrival of Harvey runoff, allowing the public to directly experience and quantify the effects of Hurricane Harvey on local reefs using quadrats and identification guides. This project will also synergize with NSF REU programs at Boston University and Texas A&M University, providing transformative research experiences for undergraduates. One post-doctoral scholar, four graduate students, a technician and more than 5 undergraduates will be involved in all aspects of the research. All datasets will be made freely available to the public, and will serve as an important set of baselines for future lines of inquiry into the processes by which hurricanes and other extreme storms impact reef health. Hurricanes and other extreme storm events can decimate coral reefs through wave-driven physical damage. Freshwater runoff from extreme storms is also potentially detrimental to reefs but has received comparatively less attention. This research will provide unprecedented resolution on how hurricanes and other extreme storm events may trigger cascading interactions among water chemistry, declines in metazoan health and shifts in their associated microbial communities, ultimately resulting in coral reef decline. The freshwater runoff initiated by Hurricane Harvey is likely to impact reefs within the FGBNMS, one of the few remaining coral-dominated reefs in the greater Caribbean. The effects of Harvey runoff will be compared to a previously documented storm-driven runoff event that was associated with invertebrate mortality on the same reef system. Sampling seawater chemistry, microbial communities (water column and benthic), and host gene expression and proteomics before, immediately after, and six months after Harvey runoff enters the FGBNMS will allow us to identify commonalities among large-scale freshwater runoff events and track the response of benthic invertebrate health, microbial community diversity, and the trajectory of reef community recovery or decline. The investigators will determine if changes in water chemistry induce pelagic microbial shifts, if microbial communities typically associated with corals and sponges are altered, and whether feedbacks occur between these potential drivers of benthic invertebrate mortality.
CC0 1.0: https://creativecommons.org/publicdomain/zero/1.0/
Cannabis is a genus of flowering plants in the family Cannabaceae.
Source: https://en.wikipedia.org/wiki/Cannabis
In October 2016, Phylos Bioscience released a genomic open dataset of approximately 850 strains of Cannabis via the Open Cannabis Project. In combination with other genomics datasets made available by Courtagen Life Sciences, Michigan State University, NCBI, Sunrise Medicinal, University of Calgary, University of Toronto, and Yunnan Academy of Agricultural Sciences, the total amount of publicly available data exceeds 1,000 samples taken from nearly as many unique strains.
These data were retrieved from the National Center for Biotechnology Information’s Sequence Read Archive (NCBI SRA), processed using the BWA aligner and FreeBayes variant caller, indexed with the Google Genomics API, and exported to BigQuery for analysis. Data are available directly from Google Cloud Storage at gs://gcs-public-data--genomics/cannabis, as well as via the Google Genomics API as dataset ID 918853309083001239, and an additional duplicated subset of only transcriptome data as dataset ID 94241232795910911, as well as in the BigQuery dataset bigquery-public-data:genomics_cannabis.
All tables in the Cannabis Genomes Project dataset have a suffix like _201703. The suffix is referred to as [BUILD_DATE] in the descriptions below. The dataset is updated frequently as new releases become available.
The following tables are included in the Cannabis Genomes Project dataset:
Sample_info contains fields extracted for each SRA sample, including the SRA sample ID and other data that give indications about the type of sample. Sample types include: strain, library prep methods, and sequencing technology. See SRP008673 for an example of upstream sample data. SRP008673 is the University of Toronto sequencing of Cannabis Sativa subspecies Purple Kush.
MNPR01_reference_[BUILD_DATE] contains reference sequence names and lengths for the draft assembly of Cannabis Sativa subspecies Cannatonic produced by Phylos Bioscience. This table contains contig identifiers and their lengths.
MNPR01_[BUILD_DATE] contains variant calls for all included samples and types (genomic, transcriptomic) aligned to the MNPR01_reference_[BUILD_DATE] table. Samples can be found in the sample_info table. The MNPR01_[BUILD_DATE] table is exported using the Google Genomics BigQuery variants schema. This table is useful for general analysis of the Cannabis genome.
MNPR01_transcriptome_[BUILD_DATE] is similar to the MNPR01_[BUILD_DATE] table, but it includes only the subset transcriptomic samples. This table is useful for transcribed gene-level analysis of the Cannabis genome.
Fork this kernel to get started with this dataset.
Dataset Source: http://opencannabisproject.org/
Category: Genomics
Use: This dataset is publicly available for anyone to use under the terms provided by the Dataset Source (https://www.ncbi.nlm.nih.gov/home/about/policies.shtml) and is provided "AS IS" without any warranty, express or implied, from Google. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.
Update frequency: As additional data are released to GenBank
View in BigQuery: https://bigquery.cloud.google.com/dataset/bigquery-public-data:genomics_cannabis
View in Google Cloud Storage: gs://gcs-public-data--genomics/cannabis
Banner photo by Rick Proctor from Unsplash.
Which Cannabis samples are included in the variants table?
Which contigs in the MNPR01_reference_[BUILD_DATE] table have the highest density of variants?
How many variants does each sample have at the THC Synthase gene (THCA1) locus?
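As a sketch of how such questions can be answered, here is a minimal query using the Google Cloud BigQuery Python client (the _201703 build-date suffix is taken from the example above; the call.call_set_name field follows the Google Genomics variants schema mentioned in the table description, but should be verified against the actual table):

from google.cloud import bigquery

client = bigquery.Client()
query = """
SELECT call.call_set_name AS sample, COUNT(1) AS n_variants
FROM `bigquery-public-data.genomics_cannabis.MNPR01_201703` AS v,
     UNNEST(v.call) AS call
GROUP BY sample
ORDER BY n_variants DESC
"""
for row in client.query(query).result():  # counts variant calls per sample
    print(row.sample, row.n_variants)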
Attribution 4.0 (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
License information was derived automatically
In this repository we keep internal data for the microbetag microbial co-occurrence network annotator.
microbetag makes use of 2-column files for each genome, indicating each KO term found and a KEGG module in which this term takes part.
As a single KO term might participate in more than one KEGG module, the same KO may appear more than once in an annotation file.
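A minimal sketch of reading such a 2-column annotation file and grouping the KO terms per module (the file name and column order are assumptions based on the description above):

import pandas as pd

ann = pd.read_csv("genome_annotation.tsv", sep="\t", names=["ko", "module"])  # hypothetical file name
kos_per_module = ann.groupby("module")["ko"].apply(set)  # KO terms observed in each module
print(kos_per_module.head())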
Repository contents:
- md5:e3e62b305e64b27da7b80655d7f92f2c — for all the GTDB genomes, their corresponding PATRIC annotations were gathered; then, using modelseedpy, we constructed their genome-scale metabolic reconstructions.
- md5:cbcc9aa1a28a5bd5f6661f832d27bcbf — all representative genomes of GTDB (v.202) were parsed and their corresponding `.faa` files were retrieved from the NCBI FTP. Then the kofam_scan tool was used to annotate them, and finally a script was used to keep the KOs of each genome per module.
- A pickle file with the seeds of each GEM included in the gtdb_modelseed_gems.zip file, related to the KEGG MODULES based on the seedId_keggId_module.tsv file you can find on microbetag's GitHub page. Example: PATRIC SeedSet
- A pickle file with the non-seeds of each GEM included in the gtdb_modelseed_gems.zip file, related to the KEGG MODULES based on the seedId_keggId_module.tsv file you can find on microbetag's GitHub page. Example: PATRIC NonSeedSet
- md5:9e3f7a84fe7409ef0282ca5424797976 — a list of pickle files with the re-trained classes of phenDB for the prediction of functional traits of a genome.