Wednesday, February 18, 2015
GATK script by pushkarev
#!/bin/bash
# Copyright (C) 2013-2015 Clusterk Inc
#
# Run variant calling for a chromosomal region (deduplication, indel realignment, variant calling, and variant annotation to add MQ0)
# Input:
# $input - input bam file (e.g. chr3:1-10000.bam)
# $bqsr - base quality recalibration file (bqsr.grp)
# $interval - chromosomal interval (chr3:1-10000)
#
# Output: raw.vcf
set -e
set -x
set -o pipefail
# Abort execution if any dependencies failed
[ "$CLUSTERK_FAILED_DEPS" == "" ] || ( echo Some dependencies failed: $CLUSTERK_FAILED_DEPS. Aborting && exit 1)
[ "$input" != "" ] && input=$(./swe fetch $input)
[ "$gatk_jar" != "" ] && gatk_jar=$(./swe fetch $gatk_jar)
[ "$bqsr" != "" ] && bqsr=$(./swe fetch $bqsr)
[ "$interval" != "" ] && interval_file=$(./swe fetch $interval)
[ "$GATK_REFERENCE" != "" ] && gatk_data=$(./swe dc $GATK_REFERENCE)
interval=$(cat $interval_file)
cpu_cores=32
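# Slice the known-sites VCFs down to the current interval so each GATK step only reads local records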
tabix -h $gatk_data/Mills_and_1000G_gold_standard.indels.hg19.vcf.gz $interval > VCF1.vcf
tabix -h $gatk_data/1000G_phase1.indels.hg19.vcf.gz $interval > VCF2.vcf
tabix -h $gatk_data/dbsnp_137.hg19.vcf.gz $interval > interval.dbsnp_137.hg19.vcf
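# Remove duplicate reads with Picard MarkDuplicates and index the result for GATK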
java -Xmx2g -jar gatk/MarkDuplicates.jar INPUT=$input OUTPUT=deduplicated.bam REMOVE_DUPLICATES=true METRICS_FILE=duplication.metrics
samtools index deduplicated.bam
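# Step 1: find intervals around known indels that need local realignment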
java -Xmx2g -jar $gatk_jar \
-I deduplicated.bam \
-R $gatk_data/hg19/ucsc.hg19.fasta \
-T RealignerTargetCreator \
-o realigned.intervals \
--known VCF1.vcf \
--known VCF2.vcf \
-L $interval
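# Step 2: realign reads around those intervals using the known indel sites only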
java -Xmx2g -jar $gatk_jar \
-I deduplicated.bam \
-R $gatk_data/hg19/ucsc.hg19.fasta \
-T IndelRealigner \
-targetIntervals realigned.intervals \
-o realigned.bam \
-known VCF1.vcf \
-known VCF2.vcf \
--consensusDeterminationModel KNOWNS_ONLY \
-L $interval \
-LOD 0.4 \
--downsample_to_coverage 250 \
-compress 0
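# Step 3: apply the precomputed base quality recalibration table (bqsr.grp) while writing the recalibrated BAM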
java -Xmx2g -jar $gatk_jar \
-R $gatk_data/hg19/ucsc.hg19.fasta \
-I realigned.bam \
-T PrintReads \
-o recalibrated.bam \
--disable_indel_quals \
-BQSR $bqsr \
-compress 0
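# Step 4: call variants. GATK Lite has no HaplotypeCaller, so use UnifiedGenotyper there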
if [[ $gatk_jar =~ GenomeAnalysisTKLite ]] ;
then
java -Xmx6g -jar $gatk_jar \
-R $gatk_data/hg19/ucsc.hg19.fasta \
-T UnifiedGenotyper \
-I recalibrated.bam \
--dbsnp interval.dbsnp_137.hg19.vcf \
-o raw.vcf \
-glm BOTH \
-L $interval \
-stand_call_conf 30.0 \
-stand_emit_conf 30.0 \
-nct $cpu_cores \
-rf BadCigar
else
#full featured GATK, run HaplotypeCaller
java -Xmx6g -jar $gatk_jar \
-R $gatk_data/hg19/ucsc.hg19.fasta \
-T HaplotypeCaller \
-I recalibrated.bam \
--dbsnp interval.dbsnp_137.hg19.vcf \
-stand_call_conf 30.0 \
-stand_emit_conf 30.0 \
-o raw.hc.vcf \
-L $interval \
-nct $cpu_cores \
-rf BadCigar
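# Annotate the HaplotypeCaller calls with MQ0 (MappingQualityZero), which helps downstream variant recalibration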
java -Xmx6g -jar $gatk_jar \
-R $gatk_data/hg19/ucsc.hg19.fasta \
-T VariantAnnotator \
-I recalibrated.bam \
--variant raw.hc.vcf \
-A MappingQualityZero \
--dbsnp interval.dbsnp_137.hg19.vcf \
-o raw.vcf \
-L $interval \
-nt $cpu_cores \
-rf BadCigar
fi
./swe emit file raw.vcf
NGS pipeline (BWA-MEM+GATK) by pushkarev
Introduction
After numerous requests we decided to open-source our implementation of a whole-genome analysis pipeline in order to share some of the best practices of running bioinformatics workflows on AWS. The current pipeline runs the Best Practices GATK workflow on Illumina paired-end data and is designed to work with the hg19 reference. The pipeline uses Cirrus as its task scheduler and S3 for storing intermediate files. Cirrus users can use the pipeline as is, or use it as a reference to build their own workflows.
It is also possible to run the pipeline with other schedulers (such as SGE/StarCluster) and other filesystems (e.g. a shared NFS volume); however, this requires replacing ksub with the appropriate call (e.g. qsub) and modifying the SWE wrapper to work with your specific file system.
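As an illustrative sketch only (not part of the pipeline): the ksub call below is copied from the alignment step later in this post, and the qsub flags are standard Grid Engine options rather than anything Cirrus-specific, so a substitution might look roughly like this:
# Cirrus version (see "Run BWA mem" below):
#   align_job_id=$(ksub -v interleaved=$split_job_id:$split.fastq.gz -d $split_job_id --wrap="bash align/align.sh")
# Approximate SGE equivalent; -terse prints only the job id, -hold_jid expresses the dependency:
align_job_id=$(qsub -terse -b y \
-v interleaved=$split_job_id:$split.fastq.gz \
-hold_jid $split_job_id \
bash align/align.sh)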
We present this pipeline for educational purposes only, and discourage its use in testing or diagnostics without proper validation.
This document represents a rather incomplete description of the pipeline, and I will be gradually updating it as time permits.
GATK Best Practices Pipeline
Currently we expect paired-end FASTQ as input and include the following steps:
- Sequence alignment with bwa mem
- Alignment sorting
- Base Quality Recalibration
- Indel realignment to refine local alignments
- Variant calling (UnifiedGenotyper for GATK Lite and HaplotypeCaller otherwise)
- Variant score recalibration
- Publishing to S3
Please edit general_settings.sh to point to your S3 bucket, make sure that your input files and the hg19 reference are available on S3, and start the sample exome workflow:
bash-3.2$ kauth --server my.cirrus.scheduler.com --login login --password passwd
bash-3.2$ bash gatk_pipeline.sh settings/exome-30x.settings 2>/dev/null
Will be using chr22 for base quality recalibration.
OK, job 343 is in batch mode with batch size 13
Processing paired[s3://gapp-east/sample/gcat_set_025_1.fastq.gz,s3://gapp-east/sample/gcat_set_025_2.fastq.gz], will split it in 2 chunks
Processing chromosome chr22
...
Processing chromosome chrM
Commiting all tasks to job 343
OK, 198 tasks from job 343 were sent to server
OK, commit of 198 tasks succeeded
bash-3.2$
After completion results will be available in S3, e.g.:
bash-3.2$ es3 ls s3://gapp-east/sample/GCAT_150X_Exome.UG/
1424210533 15361199 s3://gapp-east/sample/GCAT_150X_Exome.UG/recalibrated.filtered.vcf.gz
Performance
When run at scale (>3 genomes at once) using Cirrus with resource prediction enabled, AWS spot instances, default automatic scaling, and S3 for storing intermediate files, we usually observe runtimes under 3 hours per whole human genome (40x) and a compute cost ranging from $3 to $5, depending on whether UnifiedGenotyper or HaplotypeCaller is used; the setup scales to 100-1,000 whole genomes processed at the same time. Using other schedulers will probably yield worse but comparable results, especially if there are performance bottlenecks such as a shared NFS server.
Accuracy
We've run sample Illumina datasets from the BioPlanet GCAT test, which compares variants against the Genome in a Bottle (GiaB) reference. VCF files are available below; let us know if your pipeline performs better!
- Illumina-paired-end-100bp-30x HaplotypeCaller
- Illumina-paired-end-100bp-30x UnifiedGenotyper
- Illumina-paired-end-100bp-150x HaplotypeCaller
- Illumina-paired-end-100bp-150x UnifiedGenotyper
Detailed explanation
The entire workflow is submitted to the scheduler as a directed acyclic graph (DAG) of tasks with ksub. gatk_pipeline.sh is used to construct and submit the task graph for execution. We briefly cover key sections of gatk_pipeline.sh below.
Split input files
For each input file we compute the number of splits and submit a split task to the queue for each input pair. The split task returns a task id and, after completion, will emit a number of FASTQ files that can be referred to via SWE as $split_job_id:1.fastq.gz, ..., $split_job_id:N.fastq.gz
for input in $INPUT_FASTQ
do
...
splits=$[$file1_size/$input_split_size+1]
split_job_id=$(ksub \
-v splits=$splits \
-v input1=$file1 \
-v input2=$file2 \
--wrap="bash split_fastq/split.sh")
...
done
Run BWA mem
For each individual split (e.g. 3.fastq.gz) we run alignment with BWA mem and emit one alignment file per chromosome (e.g. chr1.bam, ..., chrX.bam). Each BWA task depends on the corresponding split task:
for split in $(seq 1 $splits)
do
# align.sh: accepts input split file, produces alignment, split by chromosome
align_job_id=$(ksub \
-v sample_id=SAMPLE \
-v interleaved=$split_job_id:$split.fastq.gz \
-d $split_job_id \
--wrap="bash align/align.sh")
align_job_ids="$align_job_ids $align_job_id"
done
Combine individual alignments
Alignments from multiple alignment tasks are combined into one file per chromosome with samtools (e.g. chr5.bam):
for chr in $CHROMOSOMES
do
#create comma separated list of alignment jobs
align_job_list=$(echo $align_job_ids |tr " " ",")
#create list of input alignment files for current chromosome
input_array=""
for align_job in $align_job_ids
do
input_array="$input_array $align_job:$chr.bam"
done
#submit combine jobs, passing the list of alignment files as input and all alignment jobs as prerequisites
#combine.sh: accepts a list of aligned bam files, produces a combined file for a given chromosome
# output: $combine_job_id:$chr.bam
combine_job_id=$(ksub \
-v chr=$chr \
-v input="$input_array" \
-d $align_job_list \
--wrap="bash combine/combine.sh" )
...
done
Run Base quality recalibration
GATK BQSR is usually run on one of the chromosomes (chr22):
for chr in $CHROMOSOMES
do
...
if [ "$chr" == "$BQSR_CHR" ]
then
#bqsr.sh: runs Base Quality recalibration on chr22
# output is $bqsr_job_id:bqsr.grp
bqsr_job_id=$(ksub \
-v chr=$chr \
-v input=$combine_job_id:$chr.bam \
-d $combine_job_id \
--wrap="bash bqsr/bqsr.sh")
fi
...
Split chromosome into sub-regions
This step first scans the entire chromosome to find 1kb regions that have no chance of containing variants - no repeats, no indels, and almost all bases matching the reference. The chromosome is then split into N subregions using some of these safe split points, to make sure that no short variants (<500bp) are affected by the splits.
for chr in $CHROMOSOMES
do
...
gatk_splits=$[$chr_size/$chr_split_size+1]
#chr_split.sh: finds genomic locations where it is safe to split a chromosome
# returns list of bam files: 1.bam, 2.bam, ... , N.bam
chr_split_id=$(ksub \
-v input=$combine_job_id:$chr.bam \
-v splits=$gatk_splits \
-v chr=$chr \
-d $combine_job_id \
--wrap="bash split_chr/split_chr.sh")
...
done
Run variant calling
For each split we run variant calling, which includes:
1. Picard deduplication
2. Local indel realignment
3. Application of the base quality recalibration data
4. Variant calling with HaplotypeCaller, or UnifiedGenotyper if GATK-Lite is provided
When UG is used, we also do an additional annotation step to add the MQ0 score, which significantly improves the quality of variant recalibration in the next step.
for chr in $CHROMOSOMES
do
...
gatk_job_ids=""
for split_id in $(seq 1 $gatk_splits)
do
#gatk.sh runs gatk on a sub-interval and applies BQSR
#output: $gatk_job_id:raw.vcf
#submit GATKs with lower priority to let chr_splits finish faster
gatk_job_id=$(ksub \
-v input=$chr_split_id:$split_id.bam \
-v interval=$chr_split_id:$split_id.interval \
-v bqsr=$bqsr_job_id:bqsr.grp \
-d $chr_split_id,$bqsr_job_id \
--wrap="bash gatk/gatk.sh")
gatk_job_ids="$gatk_job_ids $gatk_job_id"
done
...
done
Combine partial VCFs into one
Multiple partial VCF files are combined into one.
input_array=""
for combine_vcf_job in $combine_vcf_job_ids
do
input_array="$input_array $combine_vcf_job:raw.vcf"
done
combine_vcf_job_id=$(ksub \
-v input="$input_array" \
-d $combine_vcf_job_list \
--wrap="bash combine_vcf/combine_vcf.sh" )
Run VQSR
Variant Quality Score Recalibration is performed separately for indels and SNPs, and the variants are then merged into the final VCF file.
#run variant quality recalibration
# output: $vqsr_job_id:recalibrated.filtered.vcf.gz
vqsr_job_id=$(ksub \
-v input=$combine_vcf_job_id:raw.vcf \
-v ANALYSIS=$ANALYSIS \
-d $combine_vcf_job_id \
--wrap="bash vqsr/vqsr.sh")
Publish results to S3
The final VCF file is saved to S3 using the sample name as the directory prefix.
# save results to S3 and make them publicly accessible over HTTP
publish_job_id=$(ksub \
-v input=$vqsr_job_id:recalibrated.filtered.vcf.gz \
-v path="$SAMPLE_DATA/$NAME" \
-d $vqsr_job_id \
--wrap="bash publish/publish.sh")
SWE
to be described soon
Cirrus specific command lines
to be described soon
Included 3rd party software
Coming soon.
Download the open-sourced pipeline here:
https://github.com/pushkarev/gatk-swe
Friday, February 13, 2015
Running igvtools from the Command Line
Downloading igvtools
igvtools_<version #>.zip includes the jar file and shell scripts for running igvtools, as well as the genome files.
igvtools_nogenomes_<version #>.zip includes the jar file and shell scripts for running igvtools.
Starting with shell scripts
The igvtools utilities can be invoked, with or without the graphical user interface (GUI), from one of the following scripts:
igvtools (command-line version for linux and Mac OS 10.x)
igvtools_gui (gui version for linux and Mac OS 10.x)
igvtools.bat (command-line version for windows)
igvtools_gui.bat (gui version for windows)
The general form of the command-line version is:
igvtools [command] [options][arguments]
or
igvtools.bat [command] [options][arguments]
Recognized commands, options, arguments, and file types are described below.
Starting with java
Igvtools can also be started directly using Java. This option allows more control over Java parameters, such as the maximum memory to allocate. In this example, igvtools is started with 1500 MB of memory allocated:
java -Xmx1500m -Djava.awt.headless=true -jar igvtools.jar [command] [options][arguments]
To start with a GUI the command is
java -Xmx1500m -jar igvtools.jar -g
Memory settings
The scripts above allocate a fixed amount of memory. If this amount is not available on your platform you will get an error along the lines of "Could not start the Virtual Machine". If this happens you will need to edit the scripts to reduce the amount of memory requested, or use the Java startup option. The memory is set via a "-Xmx" parameter. For example -Xmx1500m requests 1500 MB, -Xmx1g requests 1 gigabyte.
Genome
The genome argument in the toTDF and count commands can be either an id, or a full path to a .chrom.sizes or IGV .genome file.
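For example, both forms are accepted (filenames and the .chrom.sizes path are illustrative):
igvtools toTDF coverageFile.wig coverageFile.tdf hg18
igvtools toTDF coverageFile.wig coverageFile.tdf /data/genomes/hg18.chrom.sizes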
Commands
toTDF
The toTDF command converts a sorted data input file to a binary tiled data (.tdf) file. Use this command to pre-process large datasets for improved IGV performance.
Supported input file formats are: .wig, .cn, .snp, .igv, and .gct.
Note: This tool was previously known as "tile"
Usage:
igvtools toTDF [options] [inputFile] [outputFile] [genome]
Required arguments:
inputFile The input file (see supported formats below).
outputFile Binary output file. Must end in ".tdf".
Options:
-z num Specifies the maximum zoom level to precompute. The default
value is 7 and is sufficient for most files. To reduce file
size at the expense of IGV performance this value can be
reduced.
-f list A comma delimited list specifying window functions to use
when reducing the data to precomputed tiles. Possible
values are min, max, and mean. By default only the mean
is calculated.
-p file Specifies a "bed" file to be used to map probe identifiers
to locations. This option is useful when preprocessing .gct
files. The bed file should contain 4 columns:
chr start end name
where name is the probe name in the .gct file.
Example:
igvtools toTDF -z 5 copyNumberFile.cn copyNumberFile.tdf hg18
Notes:
Data file formats, with the exception of .gct files, must be sorted by start position. Files can be sorted with the sort command described below. Attempting to preprocess an unsorted file will result in an error.
Count
The count command computes average feature density over a specified window size across the genome. Common usages include computing coverage for alignment files and counting hits in Chip-seq experiments. By default, the resulting file will be displayed as a bar chart when loaded into IGV.
Supported input file formats are: .sam, .bam, .aligned, .psl, .pslx, and .bed.
Usage:
igvtools count [options] [inputFile] [outputFile] [genome]
Required arguments:
inputFile The input file (see supported formats above).
outputFile The output file, which can be binary "tdf" or ascii "wig" format. The filename must end in ".tdf" or ".wig", or be the special string "stdout". To indicate that you want to output both a .tdf and a .wig file, list both output filenames as a single string, separated by a comma with no other delimiters. If the output file is named "stdout" the output will be written to the standard output stream in wig format.
Options:
-z, --maxZoom num
Specifies the maximum zoom level to precompute.
-w, --windowSize num
The window size over which coverage is averaged. Defaults to 25 bp.
-e, --extFactor num
The read or feature is extended by the specified distance in bp prior to counting. This option is useful for chip-seq and rna-seq applications. The value is generally set to the average fragment length of the library minus the average read length.
--preExtFactor num
The read is extended upstream from the 5' end by the specified distance.
--postExtFactor num
Effectively overrides the read length, defines the downstream extent from the 5' end. Intended for use with preExtFactor.
-f, --windowFunctions list
A comma delimited list specifying window functions to use when reducing the data to precomputed tiles. Possible values are min, max, mean, median, p2, p10, p90, and p98. The "p" values represent percentile, so p2=2nd percentile, etc.
--strands [arg]
By default, counting is combined among both strands. This setting outputs the count for each strand separately. Legal argument values are 'read' or 'first': 'read' separates counts by read strand, 'first' uses the strand of the first read in the pair. Results are saved in a separate column for .wig output, and a separate track for TDF output.
--bases
Count the occurrence of each base (A,G,C,T,N). Takes no arguments. Results are saved in a separate column for .wig output, and a separate track for TDF output.
--query [querystring]
Only count a specific region. Query string has syntax <chr>:<start>-<end>. e.g. chr1:100-1000. Input file must be indexed.
--minMapQuality [mqual]
Set the minimum mapping quality of reads to include. Default is 0.
--includeDuplicates
Include duplicate alignments in count. Default false. If this flag is included, duplicates are counted. Takes no arguments
--pairs
Compute coverage from paired alignments counting the entire insert as covered. When using this option only reads marked "proper pairs" are used.
Notes:
The input file must be sorted by start position. See the sort command below.
Example:
igvtools count -z 5 -w 25 -e 250 alignments.bam alignments.cov.tdf hg18
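To produce both a .tdf and a .wig file, or to stream wig output, use the output-argument forms described above (filenames illustrative):
igvtools count alignments.bam alignments.cov.tdf,alignments.cov.wig hg18
igvtools count alignments.bam stdout hg18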
Index
Creates an index for an alignment or feature file. Index files are required for loading alignment files into IGV, and can significantly improve performance for large feature files. Note that the index file is not directly loaded into IGV. Rather, IGV looks for the index file when the alignment or feature file is loaded. This command does not take an output file argument. Instead, the filename is generated by appending ".sai" (for alignments) or ".idx" (for features) to the input filename, as IGV relies on this naming convention to find the index. The input file must be sorted by start position (see sort command, below).
Supported input file formats are: .sam, .aligned, .vcf, .psl, and .bed.
NOTES:
- This command will not index a binary (BAM) file. Use the samtools package to sort and index BAM files.
- The "sai" index is an IGV format, it does not work with samtools or any other application.
Usage:
igvtools index [inputFile]
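For example, to index a sorted feature file (filename illustrative):
igvtools index variants.vcf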
Sort
Sorts the input file by start position, as required.
Supported input file formats are: .cn, .igv, .sam, .aligned, .psl, .bed, and .vcf.
NOTE: This command does not work with BAM files. The samtools package can be used to sort .bam files.
Usage:
igvtools sort [options] [inputFile] [outputFile]
Required arguments:
inputFile
outputFile
The special string "stdout" can be used as [outputFile], in which case the output will
be written to the standard output stream instead of a file.
Options:
-t tmpdir Specify a temporary working directory. For large input files
this directory will be used to store intermediate results of
the sort. The default is the user's temp directory.
-m maxRecords The maximum number of records to keep in memory during the
sort. The default value is 500000. Increase this number
if you receive "too many open files" errors. Decrease it
if you experience "out of memory" errors.
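For example (filenames and the temp directory are illustrative):
igvtools sort -t /tmp -m 1000000 unsorted.bed sorted.bed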
Version
Prints the igvtools version number to the console.
http://www.broadinstitute.org/igv/igvtools_commandline
Thursday, February 5, 2015
SSH tunnel: Access your MySQL server anywhere
Establish the SSH tunnel:
ssh -L 3333:mysql.server:3306 geoffham@ssh.pythonanywhere.com
Access your MySQL server as if it were local (through the forwarded local port 3333):
mysql -h 127.0.0.1 -P 3333 -u geoffham -p
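More generally, ssh -L maps a local port to a host and port as seen from the SSH server, and any local client can then connect to 127.0.0.1 on that local port. A backgrounded variant of the above (assuming a standard OpenSSH client; -f backgrounds the tunnel and -N skips running a remote command):
ssh -f -N -L 3333:mysql.server:3306 geoffham@ssh.pythonanywhere.com
mysql -h 127.0.0.1 -P 3333 -u geoffham -p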
Monday, February 2, 2015
TAU: Tuning and Analysis Utilities
TAU Performance System® is a portable profiling and tracing toolkit for performance analysis of parallel programs written in Fortran, C, C++, UPC, Java, and Python.
TAU (Tuning and Analysis Utilities) is capable of gathering performance information through instrumentation of functions, methods, basic blocks, and statements as well as event-based sampling. All C++ language features are supported including templates and namespaces. The API also provides selection of profiling groups for organizing and controlling instrumentation. The instrumentation can be inserted in the source code using an automatic instrumentor tool based on the Program Database Toolkit (PDT), dynamically using DyninstAPI, at runtime in the Java Virtual Machine, or manually using the instrumentation API.
TAU's profile visualization tool, paraprof, provides graphical displays of all the performance analysis results, in aggregate and single node/context/thread forms. The user can quickly identify sources of performance bottlenecks in the application using the graphical interface. In addition, TAU can generate event traces that can be displayed with the Vampir, Paraver or JumpShot trace visualization tools.
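As a rough sketch of a typical workflow (assuming a TAU installation built with MPI and PDT support; the Makefile name and paths below are illustrative, not prescriptive):
export TAU_MAKEFILE=$TAU_ROOT/x86_64/lib/Makefile.tau-mpi-pdt  # pick an installed TAU configuration (path illustrative)
tau_cc.sh -o app app.c   # compile with automatic PDT-based source instrumentation
mpirun -np 4 ./app       # run as usual; each process writes profile.* files
pprof                    # text summary of the collected profiles
paraprof &               # browse the same profiles in the ParaProf GUI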
Resources
An online book for TAU installation:
https://www.cs.uoregon.edu/research/tau/docs/old/index.html
A brief tutorial on TAU profiling from Cornell:
https://www.cac.cornell.edu/vw/Profiling/tauProfiling.aspx
2015 International Summer School on HPC Challenges in Computational Sciences
Graduate students and postdoctoral scholars from institutions in Canada, Europe, Japan and the United States are invited to apply for the sixth International Summer School on HPC Challenges in Computational Sciences, to be held June 21-26, 2015, in Toronto, Canada.
Applications are due March 11, 2015. The summer school is sponsored by Compute/Calcul Canada, the Extreme Science and Engineering Discovery Environment (XSEDE) with funds from the U.S. National Science Foundation, the Partnership for Advanced Computing in Europe (PRACE) and the RIKEN Advanced Institute for Computational Science (RIKEN AICS) in Japan.
Leading American, European and Japanese computational scientists and HPC technologists will offer instruction on a variety of topics, including:
- HPC challenges by discipline (e.g., earth, life and materials sciences, physics)
- HPC programming proficiencies
- Performance analysis & profiling
- Algorithmic approaches & numerical libraries
- Data-intensive computing
- Scientific visualization
- Canadian, EU, Japanese and U.S. HPC-infrastructures
The expense-paid program will benefit advanced scholars from Canadian, European, Japanese and U.S. institutions who use HPC to conduct research. Interested students should apply by March 11, 2015. Meals, housing, and travel will be covered for the selected participants. Applications from graduate students and postdocs in all science and engineering fields are welcome. Preference will be given to applicants with parallel programming experience, and a research plan that will benefit from the utilization of high performance computing systems.
Further information and application:
https://ihpcss2015.computecanada.ca
Contacts:
Compute Canada:
Jonathan Ferland
Compute Canada / Calcul Canada
jonathan.ferland@computecanada.ca
PRACE:
Hermann Lederer
RZG, Max Planck Society, Germany
Email: lederer@rzg.mpg.de
Simon Wong
ICHEC, Ireland
Email: simon.wong@ichec.ie
RIKEN:
Mitsuhisa Sato
AICS, RIKEN
msato@riken.jp
XSEDE:
Scott Lathrop
NCSA, University of Illinois at Urbana-Champaign, United States
Email: lathrop@illinois.edu
About Compute Canada / Calcul Canada:
Compute Canada / Calcul Canada (CC) provides Canadian researchers with a national platform for advanced computing. Working with research institutions and regional organizations across the country, CC provides a wide range of computing and data resources, services, and expertise to advance scientific knowledge and innovation across multiple disciplines and sectors. For more information, see www.computecanada.ca
About PRACE:
The Partnership for Advanced Computing in Europe (PRACE) is an international non-profit association with its seat in Brussels. The PRACE Research Infrastructure provides a persistent world-class high performance computing service for scientists and researchers from academia and industry in Europe. The computer systems and their operations accessible through PRACE are provided by 4 PRACE members (BSC representing Spain, CINECA representing Italy, GCS representing Germany and GENCI representing France). The Implementation Phase of PRACE receives funding from the EU's Seventh Framework Programme (FP7/2007-2013) under grant agreement RI-312763. For more information, see www.prace-ri.eu
About RIKEN AICS:
RIKEN is one of Japan's largest research organizations with institutes and centers in locations throughout Japan. The Advanced Institute for Computational Science (AICS) strives to create an international center of excellence dedicated to generating world-leading results through the use of its world-class supercomputer "K computer." It serves as the core of the "innovative high-performance computer infrastructure" project promoted by the Ministry of Education, Culture, Sports, Science and Technology. http://www.aics.riken.jp/en/
About XSEDE:
The Extreme Science and Engineering Discovery Environment (XSEDE) is the most advanced, powerful, and robust collection of integrated digital resources and services in the world. It is a single virtual system that scientists can use to interactively share computing resources, data, and expertise. XSEDE accelerates scientific discovery by enhancing the productivity of researchers, engineers, and scholars by deepening and extending the use of XSEDE's ecosystem of advanced digital services and by advancing and sustaining the XSEDE advanced digital infrastructure. XSEDE is a five-year, $121-million project and is supported by the National Science Foundation. For more information, see www.xsede.org.
Sunday, February 1, 2015
An Introduction to Security Onion
Overview
Security Onion is a Linux distro for IDS, NSM, and log management.
Network Security Monitoring (NSM) is, put simply, monitoring your network for security-related events. It might be proactive, when used to identify vulnerabilities or expiring SSL certificates, or it might be reactive, such as in incident response and network forensics. Whether you're tracking an adversary or trying to keep malware at bay, NSM provides context, intelligence and situational awareness of your network. There are some commercial solutions that get close to what Security Onion provides, but very few contain the vast capabilities of Security Onion in one package.
Many assume NSM is a solution they can buy to fill a gap; purchase and deploy solution XYZ and problem solved. The belief that you can buy an NSM denies the fact that the most important word in the NSM acronym is “M” for Monitoring. Data can be collected and analyzed, but not all malicious activity looks malicious at first glance. While automation and correlation can enhance intelligence and assist in the process of sorting through false positives and malicious indicators, there is no replacement for human intelligence and awareness. I don’t want to disillusion you. Security Onion isn’t a silver bullet that you can setup, walk away from and feel safe. Nothing is and if that’s what you’re looking for you’ll never find it. Security Onion will provide visibility into your network traffic and context around alerts and anomalous events, but it requires a commitment from you the administrator or analyst to review alerts, monitor the network activity, and most importantly, have a willingness, passion and desire to learn.
Core Components
Security Onion seamlessly weaves together three core functions: full packet capture, network-based and host-based intrusion detection systems (NIDS and HIDS, respectively), and powerful analysis tools.
Full packet capture is accomplished via netsniff-ng (http://netsniff-ng.org/), "the packet sniffing beast". netsniff-ng captures all the traffic your Security Onion sensors see and stores as much of it as your storage solution will hold (Security Onion has a built-in mechanism to purge old data before your disks fill to capacity). Full packet capture is like a video camera for your network, but better because not only can it tell us who came and went, but also exactly where they went and what they brought or took with them (exploit payloads, phishing emails, file exfiltration). It's a crime scene recorder that can tell us a lot about the victim and the white chalk outline of a compromised host on the ground. There is certainly valuable evidence to be found on the victim's body, but evidence at the host can be destroyed or manipulated; the camera doesn't lie, is hard to deceive, and can capture a bullet in transit.
Network-based and host-based intrusion detection systems (IDS) analyze network traffic or host systems, respectively, and provide log and alert data for detected events and activity. Security Onion provides multiple IDS options:
NIDS:
- Rule-driven NIDS. For rule-driven network intrusion detection, Security Onion offers the choice of Snort (http://snort.org/) or Suricata (http://suricata-ids.org/). Rule-based systems look at network traffic for fingerprints and identifiers that match known malicious, anomalous or otherwise suspicious traffic. You might say that they’re akin to antivirus signatures for the network, but they’re a bit deeper and more flexible than that.
- Analysis-driven NIDS. For analysis-driven network intrusion detection, Security Onion offers The Bro Network Security Monitor, also known as Bro IDS (http://bro-ids.org/). Bro is developed and maintained by the International Computer Science Institute at the University of California at Berkeley and supported with National Science Foundation funding. Unlike rule-based systems that look for needles in the haystack of data, Bro says, "Here's all your data and this is what I've seen. Do with it what you will and here's a framework so you can." Bro monitors network activity and logs any connections, DNS requests, detected network services and software, SSL certificates, and HTTP, FTP, IRC, SMTP, SSH, SSL, and Syslog activity that it sees, providing a real depth and visibility into the context of data and events on your network. Additionally, Bro includes analyzers for many common protocols and by default has the capacity to check MD5 sums for HTTP file downloads against Team Cymru's Malware Hash Registry project.
Beyond logging activity and traffic analyzers, the Bro framework provides a very extensible way to analyze network data in real time. Recent integration with REN-ISAC's Collective Intelligence Framework (CIF https://code.google.com/p/collective-intelligence-framework/) provides real-time correlation of network activity with up-to-date community intelligence feeds to alert when users access known malicious IPs, domains or URLs. The input framework allows you to feed data into Bro, which can be scripted, for example, to read a comma delimited file of C-level employee usernames and correlate that against other activity, such as when they download an executable file from the Internet. The file analysis framework provides protocol independent file analysis, allowing you to capture files as they pass through your network and automatically pass them to a sandbox or a file share for antivirus scanning. The flexibility of Bro makes it an incredibly powerful ally in your defense.
HIDS:
- For host-based intrusion detection, Security Onion offers OSSEC (http://www.ossec.net/), a free, open source HIDS for Windows, Linux and Mac OS X. When you add the OSSEC agent to endpoints on your network, you gain invaluable visibility from endpoint to your network’s exit point. OSSEC performs log analysis, file integrity checking, policy monitoring, rootkit detection, real-time alerting and active response. Originally created by Daniel Cid, Trend-Micro acquired OSSEC in 2009 and has continued to offer it as an open source solution. As an analyst, being able to correlate host-based events with network-based events can be the difference in identifying a successful attack.
Analysis Tools
With full packet capture, IDS logs and Bro data, there is a daunting amount of data available at the analyst's fingertips. Fortunately, Security Onion integrates the following tools to help make sense of this data:
- Sguil (http://sguil.sourceforge.net/), created by Bamm Visscher (@bammv), is "The Analyst Console for Network Security Monitoring." It is the analyst's right hand, providing visibility into the event data being collected and the context to validate the detection. Sguil provides a single GUI (written in tcl/tk) in which to view Snort or Suricata alerts, OSSEC alerts, Bro HTTP events, and Passive Real-Time Asset Detection System (PRADS) alerts. More importantly, Sguil allows you to "pivot" directly from an alert into a packet capture (via Wireshark or NetworkMiner) or a transcript of the full session that triggered the alert. So, instead of seeing only an individual packet associated with an alert and being left with the unanswerable question, "What now?" or "What happened next?," you can view all of the associated traffic and actually answer that question. Additionally, Sguil allows the analyst to query all packets captured, not just those involved with an alert, so you can correlate traffic that may not have triggered any alerts but still could be associated with malicious or undesired activity. Lastly, Sguil allows the analyst to conduct reverse DNS and whois lookups of IP addresses associated with alerts.
Sguil differs from other alert interfaces in that it allows collaboration among analysts by allowing alerts to be commented on and escalated to more senior analysts who can take action on the alerts. Sguil is the primary Security Onion tool to provide the most context around a given alert.
- Squert (http://www.squertproject.org/), created by Paul Halliday (@01110000), is a web application interface to the Sguil database. Although it is neither meant to be a real-time (or near real-time) interface nor a replacement for Sguil, it allows querying of the Sguil database and provides several visualization options for the data such as “time series representations, weighted and logically grouped result sets” and geo-IP mapping.
- Snorby (https://snorby.org/), created by Dustin Webber (@Mephux), is a web application interface to view, search and classify Snort and Suricata alerts and generate various types of reports, such as most active IDS signatures, most active sensors, and top source and destination IP addresses. With the help of the capME! plugin, it also allows the analyst to pivot into a transcript of the session that contains the packet which triggered an alert, rather than only being able to see the single packet that triggered it (similar to viewing the transcript in Sguil).
- Enterprise Log Search and Archive (ELSA https://code.google.com/p/enterprise-log-search-and-archive/), created by Martin Holste (@mcholste), is “a centralized syslog framework built on Syslog-NG, MySQL, and Sphinx full-text search. It provides a fully asynchronous web-based query interface that normalizes logs and makes searching billions of them for arbitrary strings as easy and fast as searching the web. It also includes tools for assigning permissions for viewing the logs as well as email based alerts, scheduled queries, and graphing.” In plain English, ELSA is a powerhouse search tool that allows you to effortlessly comb through most all of the data collected by Security Onion as well as any additional syslog sources that you forward to it, giving you visibility into any relevant syslog data you can send to ELSA. Additionally, ELSA offers powerful charting and graphing Dashboards via the Google Visualization API. It’s difficult to understand ELSA’s power without seeing it action (which you can do at http://www.youtube.com/watch?v=INRJZ3_Dsyc).
Deployment Scenarios
Security Onion is built on a distributed client-server model. A Security Onion "sensor" is the client and a Security Onion "server" is, well, the server. The server and sensor components can be run on a single physical machine or virtual machine, or multiple sensors can be distributed throughout an infrastructure and configured to report back to a designated server. An analyst connects to the server from a client workstation (typically a Security Onion virtual machine installation) to execute queries and retrieve data.
The following are the three Security Onion deployment scenarios:
- Standalone: A standalone installation consists of a single physical or virtual machine running both the server and sensor components and related processes. A standalone installation can have multiple network interfaces monitoring different network segments. A standalone installation is the easiest and most convenient method to monitor a network or networks that are accessible from a single location.
- Server-sensor: A server-sensor installation consists of a single machine running the server component with one or more separate machines running the sensor component and reporting back to the server. The sensors run all of the sniffing processes and store the associated packet captures, IDS alerts, and databases for Sguil, Snorby, and ELSA. The analyst connects to the server from a separate client machine and all queries sent to the server are distributed to the appropriate sensor(s), with the requested information being directed back to the client. This model reduces network traffic by keeping the bulk of the collected data on the sensors until requested by the analyst's client. All traffic between the server and sensors, and between the client and server, is protected with SSH encrypted tunnels.
- Hybrid: A hybrid installation consists of a standalone installation that also has one or more separate sensors reporting back to the server component of the standalone machine.
Conclusion
So we have full packet capture, Snort or Suricata rule-driven intrusion detection, Bro event-driven intrusion detection and OSSEC host-based intrusion detection, all running out of the box once you run Security Onion setup. These disparate systems with various dependencies and complexities all run seamlessly together and would otherwise take hours, days or weeks to assemble and integrate on their own. What was once a seemingly impossible task is now as easy to install as Windows.
Security Onion blog: http://blog.securityonion.net/p/securityonion.html