Useful Bash operations

awk

# Print every line whose 4th column is greater than 1000.
awk '$4 > 1000 { print $0 }' a.txt
# For lines whose 1st column is greater than 1000 and whose 2nd column equals
# "mRNA", print only columns 1, 2 and 3.
awk '($1 > 1000) && ($2 == "mRNA") { print $1, $2, $3 }' a.txt
# Print every line whose 2nd column starts with "scaffold".
awk '$2 ~ /^scaffold/ { print $0 }' a.txt

grep

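Warning: grep -c counts the number of lines that contain a match, not the number of matches, so several matches on one line are counted only once.
To count every occurrence of a string, use grep -o, which prints each match on its own line, and pipe its output to wc -l.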

For example, suppose we want to know how many times the restriction site “ATGC” occurs in chromosome 1.
Chromosome 1 looks like this:

1
2
3

>chromosome 1  
ATGCATGCAAAA  
ATGCATGCAAAA

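The result is 4: grep -o "ATGC" prints one line per match and wc -l counts those lines (e.g. grep -o "ATGC" chr1.fa | wc -l, if the sequence is stored in chr1.fa), whereas grep -c "ATGC" would report 2, the number of matching lines.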

cut

# For files that are not tab-separated, -d sets the field delimiter
# (here a single space); -f 1 then selects the first field.
cut -d ' ' -f 1 a.txt

统计一条fasta有多长

我的目录下有很多个fasta文件，我想看每一个fasta有多长：

# Print the sequence length of every FASTA file matched by */*fsa,
# one number per line.
for f in */*fsa; do
  # An unmatched glob stays literal; skip it instead of handing grep a bogus path.
  [ -f "$f" ] || continue
  # Drop the header lines (those starting with ">"), join the remaining lines
  # and count the characters. wc -c is used because wc -L is not in POSIX;
  # the final tr strips the padding some wc implementations print.
  grep -v -e '^>' -- "$f" | tr -d '\n' | wc -c | tr -d ' '
done

这样通常是不方便的，我们可以同时输出fasta的名字以及sequence的长度：

# Append one "<name> <length>" record to chromo_length.txt for every FASTA
# file matched by */*.fsa.
for i in */*.fsa; do
  # An unmatched glob stays literal; skip it rather than appending an empty record.
  [ -f "$i" ] || continue
  # Name: first space-delimited word of the header line(s), without the ">".
  NAME=$(grep -e '^>' -- "$i" | cut -f 1 -d " " | tr -d ">")
  # Length: every non-header line joined together and counted; the final tr
  # strips the padding some wc implementations print before the number.
  LENGTH=$(grep -v -e '^>' -- "$i" | tr -d "\n" | wc -c | tr -d " ")
  printf '%s %s\n' "${NAME}" "${LENGTH}" >> chromo_length.txt
done

统计populus目录下有多少个文件

# Print, for each path whose name starts with "populus", the number of
# entries that ls lists for it (one count per line).
# The * is left unescaped so that the loop body runs once per match.
for i in populus*; do
  # An unmatched glob stays literal; skip it instead of letting ls fail.
  [ -e "$i" ] || continue
  ls -- "$i" | wc -l
done