Data Manipulation

Data wrangling

# If file is not specified, the file /usr/share/dict/words is used.
# Print the first 10 dictionary words starting with "phi"
# (the default dictionary is searched case-insensitively, hence "Phil...").
look phi | head -n 10
# Phil
# Philadelphia
# Philadelphia's
# Philby
# Philby's
# Philip
# Philippe
# Philippe's
# Philippians
# Philippine

Repeat printing string n times (e.g. print 'hello world' five times)

printf 'hello world\n%.0s' {1..5}

Do not echo the trailing newline

# -n tells echo not to append a newline; $(...) captures its output.
username=$(echo -n 'bashoneliner')

Copy a file to multiple files (e.g copy fileA to file(B-D))

tee <fileA fileB fileC fileD >/dev/null

Delete all non-printing characters

tr -dc '[:print:]' < filename

Remove newline / nextline

tr --delete '\n' <input.txt >output.txt

Replace newline

tr '\n' ' ' <filename

To uppercase/lowercase

# Reads stdin and writes the text converted to uppercase.
# tr takes two character sets, not sed-style /.../ delimiters.
# Swap the two classes to convert to lowercase instead.
tr '[:lower:]' '[:upper:]'

Translate a range of characters (e.g. substitute a-z into a)

echo 'something' |tr a-z a
# aaaaaaaaa

Compare two files (e.g. fileA, fileB)

diff fileA fileB
# a: added; d: deleted; c: changed

# or
sdiff fileA fileB
# side-by-side merge of file differences

Compare two files, strip trailing carriage return/ nextline (e.g. fileA, fileB)

 diff fileA fileB --strip-trailing-cr

Number a file (e.g. fileA)

# Print fileA with line numbers prepended.
nl fileA

#or
nl -nrz fileA
# add leading zeros

#or
nl -w1 -s ' ' fileA
# minimal number width, a single space between the number and the line

Join two files field by field with tab (default join by the first column of both file, and default separator is space)

# fileA and fileB must both be sorted on the join field (e.g. with sort -k1,1).
# -t takes exactly one character, so pass a real tab with $'\t';
# the two-character string '\t' is rejected by join.
join -t $'\t' fileA fileB

# Join using specified field (e.g. column 3 of fileA and column 5 of fileB)
join -1 3 -2 5 fileA fileB

Combine/ paste two or more files into columns (e.g. fileA, fileB, fileC)

paste fileA fileB fileC
# default tab separate

Group/combine rows into one row

# e.g. filename contains:
# AAAA
# BBBB
# CCCC
# DDDD
# Each '-' consumes one line of stdin per output row; the pieces are joined
# with a tab by default (use -d '' to concatenate them with no separator).
paste - - <filename
# AAAA    BBBB
# CCCC    DDDD
paste - - - - <filename
# AAAA    BBBB    CCCC    DDDD

Fastq to fasta (fastq and fasta are common file formats for bioinformatics sequence data)

cat file.fastq | paste - - - - | sed 's/^@/>/g'| cut -f1-2 | tr '\t' '\n' >file.fa

Reverse string

echo 12345| rev

Generate sequence 1-10

seq 10

Find average of input list/file of integers

# Average of the integers in filename (one per line), truncated to 2 decimals.
# "wc -l <filename" prints only the count, so no cut is needed and padded
# wc output cannot break the parse; paste -sd+ builds "n1+n2+..." for bc.
i=$(wc -l <filename); echo "scale=2; ($(paste -sd+ filename)) / $i" | bc

Generate all combination (e.g. 1,2)

echo {1,2}{1,2}
# 11 12 21 22

Generate all combination (e.g. A,T,C,G)

# Build the pattern {A,T,C,G}{A,T,C,G}... ($group copies) and let brace
# expansion enumerate every sequence of that length.
# Assignments must not have spaces around '='.
charset='{A,T,C,G}'
group=5
repetition=''
for ((i = 0; i < group; i++)); do
  repetition=$charset$repetition
done
# Brace expansion happens before variable expansion, so the pattern is handed
# to a fresh shell that sees it as literal text and expands it.
bash -c "echo $repetition"

Read file content to variable

foo=$(<test1)

Echo size of variable

echo ${#foo}

Echo a tab

echo -e ' \t '

Split file into smaller file

# Split by line (e.g. 1000 lines/smallfile)
split -d -l 1000 largefile.txt

# Split by byte without breaking lines across files
split -C 10 largefile.txt

Create a large amount of dummy files (e.g 100000 files, 10 bytes each):

#1. Create a big file
dd if=/dev/zero of=bigfile bs=1 count=1000000

#2. Split the big file to 100000 10-bytes files
 split -b 10 -a 10 bigfile

Rename all files (e.g. remove ABC from all .gz files)

rename 's/ABC//' *.gz

Remove file extension (e.g remove .gz from filename.gz)

basename filename.gz .gz

zcat filename.gz> $(basename filename.gz .gz).unpacked

Add file extension to all file(e.g add .txt)

# Quote the expression so the shell passes it to rename untouched.
rename 's/$/.txt/' *
# You can use rename -n 's/$/.txt/' * to check the result first; it will only print something like this:
# rename(a, a.txt)
# rename(b, b.txt)
# rename(c, c.txt)

Squeeze repeated characters (e.g. consecutive tabs: \t\t --> \t)

# Squeeze each run of consecutive tab characters into a single tab.
# The tab escape is '\t'; "/t" would squeeze runs of '/' and of 't' instead.
tr -s '\t' < filename

Do not print nextline with echo

echo -e 'text here \c'

View first 50 characters of file

head -c 50 file

Cut and get last column of a file

# Reverse each line, take the first '/'-delimited field, then reverse it back.
rev file | cut -d '/' -f 1 | rev

Add one to variable/increment/ i++ a numeric variable (e.g. $var)

((var++))
# or
var=$((var+1))

Cut the last column

# Reverse each line, take the first tab-delimited field, then reverse it back.
rev filename | cut -f 1 | rev

Cat to a file

cat >myfile
let me add sth here
exit by control + c
^C

Clear the contents of a file (e.g. filename)

>filename

Append to file (e.g. hihi)

echo 'hihi' >>filename

Working with json data

#install the useful jq package
#sudo apt-get install jq
#e.g. to get all the values of the 'url' key, simply pipe the json to the following jq command(you can use .[]. to select inner json, i.e jq '.[].url')
cat file.json | jq '.url'

Decimal to Binary (e.g get binary of 5)

# Brace expansion lists every 8-bit string in ascending order, so the array
# index is the decimal value and the element is its binary representation.
D2B=({0,1}{0,1}{0,1}{0,1}{0,1}{0,1}{0,1}{0,1})
printf '%s\n' "${D2B[5]}"
#00000101
printf '%s\n' "${D2B[255]}"
#11111111

Wrap each input line to fit in specified width (e.g 4 integers per line)

echo "00110010101110001101" | fold -w4
# 0011
# 0010
# 1011
# 1000
# 1101

Sort a file by column and keep the original order of lines with equal keys (stable sort)

sort -k3,3 -s

Right align a column (right align the 2nd column)

cat file.txt|rev|column -t|rev

To both view and store the output

echo 'hihihihi' | tee outputfile.txt
# use '-a' with tee to append to file.

Show non-printing (Ctrl) characters with cat

cat -v filename

Convert tab to space

expand filename

Convert space to tab

unexpand filename

Display file in octal ( you can also use od to display hexadecimal, decimal, etc)

od filename

Reverse cat a file

tac filename

Reverse the result from uniq -c

# Each line of test.txt is "<count> <text>" as printed by uniq -c;
# print <text> <count> times to rebuild the original lines.
# read -r keeps backslashes literal and drops the leading padding before the
# count; quoting preserves inner whitespace, prevents glob expansion, and
# keeps an empty <text> empty (an unquoted empty argument makes yes print "y").
while read -r count line; do yes "$line" | head -n "$count"; done <test.txt

Last updated