Page tree

Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

Consider this multi-sample VCF input record at chromosome 1, position 100. It lists four samples whose genotypes are: homozygous reference [AA/AA], heterozygous SNP [AA/AT], heterozygous insertion [AT/AAC] and heterozygous deletion [AA/A]:

#CHROM POS    REF    ALT       FORMAT  SAMPLE1       SAMPLE2        SAMPLE3        SAMPLE4
1 100 AA AT,AAC,A GT:AD 0/0:40,1,0,0 0/1:19,20,1,0 2/1:0,20,22,0 0/3:19,0,0,20

The OpenCB variant normalization process first splits this record into three individual variants, one per alternate allele:

#CHROM POS    REF    ALT       
1 100  AA     AT
1   100    AA     AAC
1 100 AA A

Each variant is then allele-trimmed and its position updated accordingly:

#CHROM POS    REF    ALT       
1 101  A      T
1   102    -     C
1 100 A -

The final JSON representation of the Variant objects as stored in the OpenCGA database is as follows:

{
  "id" : "1:101:A:T",
  "chromosome" : "1",
  "start" : 101,
  "end" : 101,
  "reference" : "A",
  "alternate" : "T",
  "type" : "SNV",
  "studies" : [ {
    "files" : [ {
      "call" : {
        "variantId" : "1:100:AA:AT,AAC,A",
        "alleleIndex" : 0
      }
    } ],
    "secondaryAlternates" : [ {
      "chromosome" : "1",
      "start" : 102,
      "end" : 101,
      "reference" : "",
      "alternate" : "C",
      "type" : "INDEL"
    }, {
      "chromosome" : "1",
      "start" : 100,
      "end" : 100,
      "reference" : "A",
      "alternate" : "",
      "type" : "INDEL"
    } ],
    "sampleDataKeys" : [ "GT", "AD" ],
    "samples" : [ {
      "sampleId" : "SAMPLE1",
      "data" : [ "0/0", "40,1,0,0" ]
    }, {
      "sampleId" : "SAMPLE2",
      "data" : [ "0/1", "19,20,1,0" ]
    }, {
      "sampleId" : "SAMPLE3",
      "data" : [ "2/1", "0,20,22,0" ]
    }, {
      "sampleId" : "SAMPLE4",
      "data" : [ "0/3", "19,0,0,20" ]
    } ]
  } ]
}
{
  "id" : "1:102:-:C",
  "chromosome" : "1",
  "start" : 102,
  "end" : 101,
  "reference" : "",
  "alternate" : "C",
  "type" : "INDEL",
  "studies" : [ {
    "files" : [ {
      "call" : {
        "variantId" : "1:100:AA:AT,AAC,A",
        "alleleIndex" : 1
      }
    } ],
    "secondaryAlternates" : [ {
      "chromosome" : "1",
      "start" : 101,
      "end" : 101,
      "reference" : "A",
      "alternate" : "T",
      "type" : "SNV"
    }, {
      "chromosome" : "1",
      "start" : 100,
      "end" : 100,
      "reference" : "A",
      "alternate" : "",
      "type" : "INDEL"
    } ],
    "sampleDataKeys" : [ "GT", "AD" ],
    "samples" : [ {
      "sampleId" : "SAMPLE1",
      "data" : [ "0/0", "40,0,1,0" ]
    }, {
      "sampleId" : "SAMPLE2",
      "data" : [ "0/2", "19,1,20,0" ]
    }, {
      "sampleId" : "SAMPLE3",
      "data" : [ "1/2", "0,22,20,0" ]
    }, {
      "sampleId" : "SAMPLE4",
      "data" : [ "0/3", "19,0,0,20" ]
    } ]
  } ]
}
{
  "id" : "1:100:A:-",
  "chromosome" : "1",
  "start" : 100,
  "end" : 100,
  "reference" : "A",
  "alternate" : "",
  "type" : "INDEL",
  "studies" : [ {
    "files" : [ {
      "call" : {
        "variantId" : "1:100:AA:AT,AAC,A",
        "alleleIndex" : 2
      }
    } ],
    "secondaryAlternates" : [ {
      "chromosome" : "1",
      "start" : 101,
      "end" : 101,
      "reference" : "A",
      "alternate" : "T",
      "type" : "SNV"
    }, {
      "chromosome" : "1",
      "start" : 102,
      "end" : 101,
      "reference" : "",
      "alternate" : "C",
      "type" : "INDEL"
    } ],
    "sampleDataKeys" : [ "GT", "AD" ],
    "samples" : [ {
      "sampleId" : "SAMPLE1",
      "data" : [ "0/0", "40,0,1,0" ]
    }, {
      "sampleId" : "SAMPLE2",
      "data" : [ "0/2", "19,0,20,1" ]
    }, {
      "sampleId" : "SAMPLE3",
      "data" : [ "3/2", "0,0,20,22" ]
    }, {
      "sampleId" : "SAMPLE4",
      "data" : [ "0/1", "19,20,0,0" ]
    } ]
  } ]
}