So, if any of you are trying to figure out how to download all 10k+ Bacterial genomes (or anything in batch) and are working on a remote UNIX system, here's how to do it:
Step 1:
Download ftputil from http://ftputil.sschwarzer.net/trac
If you're unfamiliar with Python (as I am), here's how to install it:
tar -xzf ftputil-*.tar.gz
cd ftputil-*
./setup.py install --root ~   # This will install it in ~/usr/lib (useful if you don't have install permissions for /usr/lib)
Step 2:
Create a file called get_gens.py (or whatever you want), and put this in it (I've changed the original code to download all the .fna files. It's easy to change to download something different.)
#! /usr/bin/python
# Imports for the download script.  ftputil is expected to live in a
# per-user site-packages directory, so that directory is appended to the
# module search path before ftputil is imported.
import os
import string
import sys

# Python does not expand "~" in path strings the way a shell does, so the
# home directory has to be resolved explicitly; otherwise the entry added
# to sys.path would be a literal "~" directory that does not exist.
sys.path.append(os.path.expanduser("~/usr/lib/python2.4/site-packages/"))
import ftputil
# Download every ".fna" file found one level below /genomes/Bacteria/ on
# the NCBI FTP server into a matching local directory tree.  Files that
# already exist locally are skipped, so the script can be re-run to resume.

# Where to put the genomes (the script will create sub directories
# using the same names as the NCBI use). This directory must
# exist already.  The "~" is expanded explicitly because os.path and
# os.mkdir treat it as an ordinary character, not as the home directory.
base_path = os.path.expanduser("~/genomes/Bacteria/")

# Remote directory holding one sub directory per genome.
remote_root = '/genomes/Bacteria/'

# Only files ending in this suffix are fetched; use ".gbk" instead to
# download the GenBank files.
wanted_suffix = ".fna"

host = ftputil.FTPHost('ftp.ncbi.nlm.nih.gov', 'anonymous', 'password')
try:
    host.chdir(remote_root)
    dir_list = host.listdir(host.curdir)
    for dir_name in dir_list:
        # The previous iteration may have moved into a genome directory,
        # so return to the root before testing the next relative name.
        host.chdir(remote_root)
        if not host.path.isdir(dir_name):
            continue
        print(dir_name)
        host.chdir(remote_root + dir_name + '/')
        local_dir = os.path.join(base_path, dir_name)
        for file_name in host.listdir(host.curdir):
            if not file_name.endswith(wanted_suffix):
                continue
            local_file = os.path.join(local_dir, file_name)
            print("File " + file_name)
            # The local directory is only created once a wanted file is
            # seen, so genomes without such files leave no empty directory.
            if not os.path.isdir(local_dir):
                print("Making directory " + local_dir)
                os.mkdir(local_dir)
            if os.path.isfile(local_file):
                # NOTE: an interrupted earlier download leaves a partial
                # file behind that is skipped here; delete it to re-fetch.
                print("Skiping file " + local_file)
            elif host.path.isfile(file_name):
                print("Downloading file " + local_file)
                # Download arguments: remote filename, local filename, mode
                # NOTE(review): the third argument is taken to be the
                # transfer mode of the ftputil release this script was
                # written for -- confirm against the installed version.
                host.download(file_name, local_file, 't')
            else:
                print("ERROR - Not a file " + dir_name + "/" + file_name)
finally:
    # Always release the FTP connection, even if a transfer fails.
    host.close()