#!/usr/bin/perl -s
#*************************************************************************
#
# Program: prodoric
# File: prodoric.pl
#
# Version: V1.0
# Date: 09.06.11
# Function: Query the Prodoric web server for prokaryotic promoter
# binding sites
#
# Copyright: (c) Dr. Andrew C. R. Martin 2011
# Author: Dr. Andrew C. R. Martin
# Address: Institute of Structural and Molecular Biology
# Division of Biosciences,
# University College,
# Gower Street,
# London.
# WC1E 6BT.
# EMail: andrew@bioinf.org.uk
#
#*************************************************************************
#
# This program is not in the public domain, but it may be copied
# according to the conditions laid out in the accompanying file
# COPYING.DOC
#
# The code may be modified as required, but any modifications must be
# documented so that the person responsible can be identified. If
# someone else breaks this code, I don't want to be blamed for code
# that does not work!
#
# The code may not be sold commercially or included as part of a
# commercial product except as described in the file COPYING.DOC.
#
#*************************************************************************
#
# Description:
# ============
# Submits a sequence to the Prodoric bacterial promoter prediction
# server at http://prodoric.tu-bs.de/vfp/vfp_promoter.php
#
#*************************************************************************
#
# Usage:
# ======
# See Usage message with
# prodoric -h
#*************************************************************************
#
# Revision History:
# =================
# V1.0 09.06.11 Original By: ACRM
#
#*************************************************************************
use LWP::UserAgent;
use strict;
# Create global arrays of list of species and list of PWMs
Setup();
# Parse the command line getting list of species indexes
my $species = ParseCmdLine();
# Build list of PWMs for these species. This comes back as an array of chunks
my @pwms = BuildPwms($species,1000);
# Read the sequence from the FASTA file
my $seq = ReadSeq();
# Print a results header
print "# PWM, start, stop, score, sequence\n";
# Run Prodoric
my $results = Prodoric($seq, @pwms);
# Print the results
PrintResults($results);
########################################################################
# Prints the results, obeying print options:
# -t=threshold (specify threshold)
# -c (just count those above the threshold)
# -u (just count distinct promoters above the threshold)
#
# 09.06.11 Original By: ACRM
sub PrintResults
{
my($allResults) = (@_);
my @results;
my $threshold = 0;
my %printed = ();
@results = split(/\n/, $results);
if(defined($::t))
{
$threshold = $::t;
}
my $count = 0;
foreach my $result (@results)
{
my @fields = split(/\,/,$result);
if($fields[3] >= $threshold)
{
if(defined($::u))
{
if(!defined($printed{$fields[0]}))
{
$printed{$fields[0]} = 1;
$count++;
}
}
else
{
$count++;
}
if(!defined($::c) && !defined($::u))
{
print "$result\n";
}
}
}
if(defined($::u))
{
print "Number of unique PWM matches above threshold ($threshold) = $count\n";
}
elsif(defined($::c))
{
print "Number of matches above threshold ($threshold) = $count\n";
}
}
########################################################################
# Called with the sequence and an array of Prodoric PWM name sets. Each
# name set uses a # to delimit each PWM name - i.e. name#name#name#
# (note trailing #)
# Returns a set of results in a string
#
# 09.06.11 Original By: ACRM
sub Prodoric
{
my($seq, @pwms) = @_;
my $results = "";
foreach my $pwms (@pwms)
{
if(defined($::v))
{
my $tmp = $pwms;
$tmp =~ s/\#/\n /g;
print STDERR "Testing:\n $tmp\n";
}
$results .= doProdoric($seq, $pwms);
}
return($results);
}
########################################################################
# Read the first sequence from a FASTA file
# 09.06.11 Original By: ACRM
sub ReadSeq
{
my $seq = "";
while(<>)
{
chomp;
if(/^\>/)
{
if($seq ne "")
{
return($seq);
}
}
else
{
s/\s//g;
$seq .= $_;
}
}
return($seq);
}
########################################################################
# Does the work of running Prodoric on an individual PWM nameset
# 09.06.11 Original By: ACRM
#
sub doProdoric
{
my ($sequence, $pwms) = @_;
my $url="http://prodoric.tu-bs.de/vfp/vfp_promoter_start.php";
my $webproxy = "";
my %options;
$options{genome} = "PLEASE SELECT...";
$options{gene_name} = "";
$options{upstream} = 500;
$options{genome_pos1} = "";
$options{genome_pos2} = "";
$options{userfile} = "";
$options{sequence} = $sequence;
$options{pwm_data} = $pwms;
$options{seq_source} = 3; # paste raw sequence
$options{msens1} = $::sensitivity; # Sensitivity 0.7-1.0 (default 0.8)
$options{nop} = ($::nonop?0:1); # Non-occurrence penalty (default 1)
$options{csens1} = $::core; # Core sensitivty 0.7-1.0 (default 0.9)
$options{core1} = $::size; # Core size 3-8 (default 5)
$options{constraint} = 2; # Show all results for each PWM
# $options{constraint} = 1; # Show top results for each PWM
$options{max_results} = 3; # How many to show for each PWM
$options{MAX_FILE_SIZE} = 100000000;
# build options into post string
my $post = "";
foreach my $key (keys %options)
{
if($post eq "")
{
$post = "$key=$options{$key}";
}
else
{
$post .= "&";
$post .= "$key=$options{$key}";
}
}
if(defined($::v))
{
print STDERR "Submitting data to Prodoric\n";
}
# Submit
my $ua = CreateUserAgent($webproxy);
my $req = CreatePostRequest($url, $post);
my $html = GetContent($ua, $req);
# Find the URL for the results page from the html
$html =~ /\/tmp\/result(\d+)\.html/;
my $id = $1;
# Keep refreshing the results page until we have the final data
do
{
sleep 1;
$url = "http://prodoric.tu-bs.de/tmp/result$id.html";
if(defined($::v))
{
print STDERR "Regetting $url...\n";
}
$req = CreateGetRequest($url, "");
$html = GetContent($ua, $req);
# Find the URL for the results page from the html
$id = "";
if($html =~ /\/tmp\/result(\d+)\.html/)
{
$id = $1;
}
} while($id ne "");
if(defined($::v))
{
print STDERR "Parsing results...\n";
}
my $results = ParseProdoricHTML($html);
return($results);
}
########################################################################
# Get data from a website using LWP
sub GetContent
{
my($ua, $req) = @_;
my($res);
$res = $ua->request($req);
if($res->is_success)
{
return($res->content);
}
return(undef);
}
########################################################################
# Create a get request using LWP
sub CreateGetRequest
{
my($url) = @_;
my($req);
$req = HTTP::Request->new('GET',$url);
return($req);
}
########################################################################
# Create a post request using LWP
sub CreatePostRequest
{
my($url, $params) = @_;
my($req);
$req = HTTP::Request->new(POST => $url);
$req->content_type('application/x-www-form-urlencoded');
$req->content($params);
return($req);
}
########################################################################
# Create a web agent using LWP
sub CreateUserAgent
{
my($webproxy) = @_;
my($ua);
$ua = LWP::UserAgent->new;
if(length($webproxy))
{
$ua->proxy(['http', 'ftp'] => $webproxy);
}
return($ua);
}
########################################################################
# Parse the HTML that is returned by Prodoric
# 09.06.11 Original By: ACRM
sub ParseProdoricHTML
{
my($html) = @_;
# Create line breaks at all table boundaries
$html =~ s/
/\n | /g;
# Extract HTML from the table with "PWM (species)" in the header
my @lines = split(/\n/, $html);
my $inSection = 0;
my @resultsTable;
foreach my $line (@lines)
{
if($line =~ /PWM \(species\)/)
{
$inSection = 1;
}
elsif($line =~ /<\/table>/)
{
$inSection = 0;
}
if($inSection)
{
push @resultsTable, $line;
}
}
# Parse each row from the table into an array, then assemble that into an output string
my $results = "";
my $count = 0;
my @fields = ();
foreach my $line (@resultsTable)
{
if($line =~ /\
/)
{
if($count)
{
# Extract data of interest
if($fields[3] eq "+")
{
$results .= "$fields[0], $fields[1], $fields[2], $fields[4], $fields[5]\n";
}
}
$count = 0;
}
elsif($line =~ /tablerow_white\".*\>(.*)\<\/td\>/)
{
$fields[$count] = $1;
$fields[$count] =~ s/\,//g; # Strip commas from fields
$count++;
}
elsif($line =~ /seq\".*\>(.*)\<\/td\>/)
{
$fields[$count] = $1;
$fields[$count] =~ s/\,//g; # Strip commas from fields
$count++;
}
}
# And the last one...
if($count)
{
# Extract data of interest
if($fields[3] eq "+")
{
$results .= "$fields[0], $fields[1], $fields[2], $fields[4], $fields[5]\n";
}
}
return($results);
}
########################################################################
# Build an array containing PWM namesets - each nameset being a
# #-delimited set of Prodoric PWM names
#
# 09.06.11 Original By: ACRM
sub BuildPwms
{
my ($specList, $chunkSize) = @_;
my @species=split(/\,/, $specList);
my @pwms = ();
my $pwmSet = 0;
my $count = 0;
foreach my $spec (@species)
{
my $specName = $::speciesList[$spec];
foreach my $pwm (@::pwmList)
{
my $pwm2 = $pwm;
$pwm2 =~ s/.*\|\s+//;
if($pwm2 eq $specName)
{
$count++;
$pwms[$pwmSet] .= "$pwm#";
if(!($count%$chunkSize))
{
$pwmSet++;
}
}
}
}
if(defined($::v))
{
print STDERR "Using $count PWMs\n";
}
return(@pwms);
}
########################################################################
# Parse the command line
#
# 09.06.11 Original By: ACRM
sub ParseCmdLine
{
my $species = "";
if(defined($::h) || defined($::help))
{
Usage();
exit 0;
}
if(!defined($::species))
{
if(defined($::all))
{
$species .= "1,2,3,4,5,6,7,8,9,10,11,12";
}
else
{
if(defined($::bac))
{
$species .= "0,1,";
}
if(defined($::bra))
{
$species .= "2,";
}
if(defined($::eco))
{
$species .= "3,4,";
}
if(defined($::hel))
{
$species .= "5,";
}
if(defined($::lis))
{
$species .= "6,";
}
if(defined($::pse))
{
$species .= "7,8,9,";
}
if(defined($::sal))
{
$species .= "10,";
}
if(defined($::str))
{
$species .= "11,12,";
}
$species =~ s/\,$//;
}
}
else
{
$species = $::species;
}
# Default to E.coli
if($species eq "")
{
$species = "3,4";
}
$::sensitivity = 0.8 if(!defined($::sensitivity));
$::nonop = 0 if(!defined($::nonop));
$::core = 0.9 if(!defined($::core));
$::size = 5 if(!defined($::size));
if(defined($::v))
{
print STDERR "Sensitivity: $::sensitivity\n";
printf STDERR "Nop: %s\n", $::nonop?"off":"on";
print STDERR "Core Sensitivity: $::core\n";
print STDERR "Core Size: $::size\n";
}
return($species);
}
########################################################################
# Print a usage message
#
# 09.06.11 Original By: ACRM
sub Usage
{
print <<__EOF;
prodoric (V1.0), (c) 2010, Dr. Andrew C.R. Martin, UCL
Usage: prodoric [-v] [-c] [-t=threshold] [-species=n[,n...]]
[-eco][-bac][-pse][-bra][-hel][-lis][-sal][-str]
[-sensitivity=n] [-nonop] [-core=n] [-size=n]
file.faa
-v Verbose
-c Just return the count of matches found
-u Just return the count of distinct TFBSs found
(overrides -c)
-t Only return results with score >= threshold
-all Predict for all known species
-eco Predict for Escherichia coli
-bac Predict for Bacillus subtilis
-bra Predict for Bradyrhizobium japonicum
-hel Predict for Helicobacter pylori
-lis Predict for Listeria monocytogenes
-pse Predict for Pseudomonas aeruginosa and Pseudomonas syringae
-sal Predict for Salmonella typhi
-str Predict for Streptococcus pyogenes
-species Predict for specific variants / strains
Use as -species=1,3,5
This overrides anything specified with -all, -eco, etc
__EOF
my $count = 0;
foreach my $spec (@::speciesList)
{
printf " %2d $spec\n", $count++;
}
print <<__EOF;
(Specied defaults to E.coli if nothing specified.)
-sensitivity Specify sensitivity (range: 0.7-1.0; default: 0.8)
-nonop Do not apply a non-occurence penalty
-core Specify the core sensitivity (range: 0.7-1.0; default: 0.9)
-size Specify the core size (range: 3-8; default: 5)
prodoric runs the Prodoric webserver at http://prodoric.tu-bs.de/vfp/vfp_promoter.php
from the command line.
See http://prodoric.tu-bs.de/vfp/vfp_help.php for the meaning of the -sensitivity,
-nonop, -core and -size flags
__EOF
}
########################################################################
# Define PWM list and species list from the Prodoric web site
# Note that
# "Lrp + Leu (SELEX) | Escherichia coli (strain K12)"
# causes the program to freeze on uploading the data - probably the '+'
# sign does something special and needs masking
#
# 09.06.11 Original By: ACRM
sub Setup
{
@::pwmList = ("AbrB | Bacillus subtilis (strain 168)",
"Ada | Escherichia coli (strain K12)",
"AhrC | Bacillus subtilis (strain 168)",
"AlgR | Pseudomonas syringae (pathovar tomato, strain DC3000)",
"AlgU (-10) | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"AlgU (-35) | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"AlgU (N16) | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"AlgU (N17) | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"Anr | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"Anr_Dnr_37 | Pseudomonas aeruginosa",
"Anr_Dnr | Pseudomonas aeruginosa",
"Anr_Dnr_40 | Pseudomonas aeruginosa",
"AraC | Escherichia coli (strain K12)",
"AraR | Bacillus subtilis (strain 168)",
"ArcA | Escherichia coli (strain K12)",
"ArgR | Escherichia coli (strain K12)",
"ArgR | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"CRE | Bacillus subtilis (strain 168)",
"CTSR | Bacillus subtilis (strain 168)",
"CaiF | Escherichia coli (strain K12)",
"CcpC | Bacillus subtilis (strain 168)",
"CitT | Bacillus subtilis (strain 168)",
"CodY | Bacillus subtilis (strain 168)",
"ComA | Bacillus subtilis (strain 168)",
"ComK | Bacillus subtilis (strain 168)",
"CovR | Streptococcus pyogenes (serovar M18 / M18, strain MGAS8232)",
"CovR | Streptococcus pyogenes (serovar M18 / M18, strain MGAS8232)",
"CpxR | Escherichia coli (strain K12)",
"Crp | Escherichia coli (strain K12)",
"CspA | Escherichia coli (strain K12)",
"CtsR | Listeria monocytogenes (serovar 1/2a, strain EGD-e)",
"CynR | Escherichia coli (strain K12)",
"CysB | Escherichia coli (strain K12)",
"CytR | Escherichia coli (strain K12)",
"DegU | Bacillus subtilis (strain 168)",
"DeoR | Escherichia coli (strain K12)",
"DinR/LexA | Bacillus subtilis (strain 168)",
"DnaA | Escherichia coli (strain K12)",
"Dnr | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"ExsA | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"FadR | Escherichia coli (strain K12)",
"FhlA | Escherichia coli (strain K12)",
"Fis | Escherichia coli (strain K12)",
"FleQ | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"FlhD2C2 | Escherichia coli (strain K12)",
"FliA | Escherichia coli (strain K12)",
"Fnr | Bacillus subtilis (strain 168)",
"Fnr_neu | Bacillus subtilis",
"Fnr | Escherichia coli (strain K12)",
"FruR | Escherichia coli (strain K12)",
"Fur (18mer) | Escherichia coli (strain K12)",
"Fur (8mer)| Escherichia coli (strain K12)",
"Fur | Helicobacter pylori (strain ATCC 700392 / 26695)",
"Fur | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"GalR | Escherichia coli (strain K12)",
"GalS | Escherichia coli (strain K12)",
"GcvA | Escherichia coli (strain K12)",
"GerE | Bacillus subtilis (strain 168)",
"GlnG | Escherichia coli (strain K12)",
"GlnR | Bacillus subtilis (strain 168)",
"GlpR | Escherichia coli (strain K12)",
"GlpR | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"GltC | Bacillus subtilis (strain 168)",
"GltR | Bacillus subtilis (strain 168)",
"GntR | Escherichia coli (strain K12)",
"H-NS | Escherichia coli (strain K12)",
"Hpr | Bacillus subtilis (strain 168)",
"IHF | Escherichia coli (strain K12)",
"IHF | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"IciA | Escherichia coli (strain K12)",
"IlvY | Escherichia coli (strain K12)",
"LacI | Escherichia coli (strain K12)",
"LasR | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"LevR | Bacillus subtilis (strain 168)",
"LexA | Escherichia coli (strain K12)",
"LicT | Bacillus subtilis (strain 168)",
"Lrp | Escherichia coli (strain K12)",
"Lrp (SELEX) | Escherichia coli (strain K12)",
# Commented out as it seems to freeze on this
# "Lrp + Leu (SELEX) | Escherichia coli (strain K12)",
"MalI | Escherichia coli (strain K12)",
"MalT | Escherichia coli (strain K12)",
"MarR | Escherichia coli (strain K12)",
"MetJ | Escherichia coli (strain K12)",
"MetJ (Selex) | Escherichia coli (strain K12)",
"MetJ (Selex) | Escherichia coli (strain K12)",
"MetR | Escherichia coli (strain K12)",
"MexR | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"Mlc | Escherichia coli (strain K12)",
"Mlc (Selex) | Escherichia coli (strain K12)",
"MngR (former FarR) | Escherichia coli (strain K12)",
"MntR | Bacillus subtilis (strain 168)",
"ModE | Escherichia coli (strain K12)",
"Mta | Bacillus subtilis (strain 168)",
"MtrB | Bacillus subtilis (strain 168)",
"Nac | Escherichia coli (strain K12)",
"NagC | Escherichia coli (strain K12)",
"NagC (Selex) | Escherichia coli (strain K12)",
"NarL | Escherichia coli (strain K12)",
"NarL | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"NhaR | Escherichia coli (strain K12)",
"OmpR (C box)| Escherichia coli (strain K12)",
"OmpR (F box)| Escherichia coli (strain K12)",
"OmpR | Escherichia coli (strain K12)",
"OmpR | Helicobacter pylori (strain ATCC 700392 / 26695)",
"OxyR | Escherichia coli (strain K12)",
"OxyR | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"OxyR (SELEX) | Escherichia coli (strain K12)",
"PdhR | Escherichia coli (strain K12)",
"PhhR | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"PhoP | Escherichia coli (strain K12)",
"PrfA | Listeria monocytogenes (serovar 1/2a strain EGD-e)",
"PucR | Bacillus subtilis (strain 168)",
"PurR | Escherichia coli (strain K12)",
"PvdS | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"PyrR | Bacillus subtilis (strain 168)",
"RcsAB | Escherichia coli (strain K12)",
"RegR | Bradyrhizobium japonicum (strain USDA 110)",
"ResD | Bacillus subtilis (strain 168)",
"Rex | Bacillus subtilis (strain 168)",
"RhlR | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"RocR | Bacillus subtilis (strain 168)",
"RofA | Streptococcus pyogenes (serovar M1, strain SF370 / ATCC 700294)",
"RpoE-SigE | Escherichia coli (strain K12)",
"RpoE-SigE | Salmonella typhi (strain CT18)",
"RpoN | Bradyrhizobium japonicum (strain USDA 110)",
"RpoN | Escherichia coli (strain K12)",
"Sig70 (-10) | Escherichia coli",
"SigB (-10) | Bacillus subtilis (strain 168)",
"SigB (-35) | Bacillus subtilis (strain 168)",
"SigB (N12) | Bacillus subtilis (strain 168)",
"SigB (N13) | Bacillus subtilis (strain 168)",
"SigB (N14) | Bacillus subtilis (strain 168)",
"SigB (N15) | Bacillus subtilis (strain 168)",
"SigD | Bacillus subtilis (strain 168)",
"SigE (-10) | Bacillus subtilis (strain 168)",
"SigE (-35) | Bacillus subtilis (strain 168)",
"SigE (N13) | Bacillus subtilis (strain 168)",
"SigE (N14) | Bacillus subtilis (strain 168)",
"SigE (N15) | Bacillus subtilis (strain 168)",
"SigH (-10) | Bacillus subtilis (strain 168)",
"SigH (-35) | Bacillus subtilis (strain 168)",
"SigL | Bacillus subtilis (strain 168)",
"SigW (-10) | Bacillus subtilis (strain 168)",
"SigW (-35) | Bacillus subtilis (strain 168)",
"SigW (N16) | Bacillus subtilis (strain 168)",
"SigW (N17) | Bacillus subtilis (strain 168)",
"SigX (-10) | Bacillus subtilis (strain 168)",
"SigX (-35) | Bacillus subtilis (strain 168)",
"SigX (N15) | Bacillus subtilis (strain 168)",
"SinR | Bacillus subtilis (strain 168)",
"Spo0A | Bacillus subtilis (strain 168)",
"Spo0A (II) | Bacillus subtilis (strain 168)",
"SpoIIID | Bacillus subtilis (strain 168)",
"TnrA | Bacillus subtilis (strain 168)",
"TreR | Bacillus subtilis (strain 168)",
"TrpI | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"Vfr | Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"Xre | Bacillus subtilis (strain 168)",
"XylR | Bacillus subtilis (strain 168)",
"YhiX | Escherichia coli (strain K12)",
"Zur | Bacillus subtilis (strain 168)");
@::speciesList = ("Bacillus subtilis",
"Bacillus subtilis (strain 168)",
"Bradyrhizobium japonicum (strain USDA 110)",
"Escherichia coli",
"Escherichia coli (strain K12)",
"Helicobacter pylori (strain ATCC 700392 / 26695)",
"Listeria monocytogenes (serovar 1/2a, strain EGD-e)",
"Pseudomonas aeruginosa",
"Pseudomonas aeruginosa (strain ATCC 15692 / PAO1)",
"Pseudomonas syringae (pathovar tomato, strain DC3000)",
"Salmonella typhi (strain CT18)",
"Streptococcus pyogenes (serovar M18 / M18, strain MGAS8232)",
"Streptococcus pyogenes (serovar M1, strain SF370 / ATCC 700294)");
}