ExecutionΒΆ

The pipeline built here could be put all together in a dodo.py file:

from judi import File, Task, add_param, combine_csvs

add_param('100 101 102 103'.split(), 'sample')
add_param('1 2'.split(), 'group')

REF = 'hg_refs/hg19.fa'
path_gen = lambda x: '{}_{}.fq'.format(x['sample'],x['group'])

class AlignFastq(Task):
  inputs = {'reads': File('orig_fastq', path = path_gen)}
  targets = {'sai': File('aln.sai')}
  actions = [('bwa aln {} {} > {}', [REF,'$reads','$sai'])]

class CreateBam(Task):
  mask = ['group']
  inputs = {'reads': AlignFastq.inputs['reads'],
            'sai': AlignFastq.targets['sai']}
  targets = {'bam': File('aln.bam', mask = mask)}
  actions = [('bwa sampe {} {} {} | samtools view -Sbh - | samtools sort - > {}', [REF,'$sai','$reads','$bam'])]

class GetCoverage(Task):
  mask = ['group']
  inputs = {'bam': CreateBam.targets['bam']}
  targets = {'cov': File('cov.csv', mask = mask)}
  actions = [('(echo val; samtools rmdup {} - | samtools mpileup - | cut -f4) > {}', ['$bam','$cov'])]

class CombineCoverage(Task):
  mask = ['group', 'sample']
  inputs = {'cov': GetCoverage.targets['cov']}
  targets = {'csv': File('combined.csv', mask = mask),
           'pdf': File('pltcov.pdf', mask = mask, root = '.')}
  actions = [(combine_csvs, ['#cov', '#csv']),
             ("""echo "library(ggplot2); pdf('{}')
              ggplot(read.csv('{}'), aes(x = val)) +
              geom_density(aes(color = factor(sample)))"\
              | R --vanilla""", ['$pdf','$csv'])]

And then executed as follows:

$ doit -f dodo.py

The pipeline can be run using more than one processor by using -n 8 command line option to doit.