...
Code Block | ||||||||
---|---|---|---|---|---|---|---|---|
| ||||||||
# First, we need to import both the ConfigClient and the OpenCGAClient
from pyopencga.opencga_config import ConfigClient
from pyopencga.opencga_client import OpenCGAClient
# client-configuration.yml carries a 'host' section that points to the OpenCGA REST endpoints.
# ConfigClient takes either the path to that file or a dictionary laid out in the same format.
client_config_path = '/opt/opencga/conf/client-configuration.yml'
config = ConfigClient(client_config_path)
# Build the OpenCGAClient instance from that configuration.
oc = OpenCGAClient(config)
# Authenticate. The two calls below illustrate the two ways of logging in.
user_id = 'myUser'
oc.users.login(user_id)  # Done this way, the password is prompted for, so it is never displayed, or ...
oc.users.login(user_id, 'myPassword')  # ... the password can be sent as an additional parameter
# Assume the installation has already been populated and we want every family that carries a
# concrete disorder: 'Rod-cone dystrophy'. The search parameters are:
#   - study: calling oc.families.search(disorders="Rod-cone dystrophy") with only the 'disorders'
#     field would work only if a single project and a single study had been defined. Most OpenCGA
#     installations are expected to hold more than one study, so we state which study's families
#     we are looking for.
#   - limit: caps the number of family results fetched. Because this is an example, we simply
#     limit the number of results to 10.
#   - disorders: the disorder the families must contain.
#   - include: if nothing is specified, all the values of each Family are fetched. Scripts normally
#     need just a few fields of a whole entry, so include/exclude speeds things up by not sending
#     data over the network that would be discarded anyway. Here we only want the Family id and
#     the ids of the family members. To know which fields can be included/excluded, follow the
#     data models defined by OpenCGA.
family_search_params = {
    "study": "study1",
    "limit": 10,
    "disorders": "Rod-cone dystrophy",
    "include": "id,members.id",
}
family_query_response = oc.families.search(**family_search_params)
# family_query_response is an instance of the QueryResponse class defined in the Python library.
# Its fields can be read as shown below. Note that in a script these bare expressions evaluate
# and discard their value; assign or print them to actually inspect the content.
family_query_response.time # Time spent on the REST call
family_query_response.apiVersion # API version of the REST service
family_query_response.queryOptions # QueryOptions of the call (include/exclude, limit, skip, count...)
family_query_response.warning # Warning messages
family_query_response.error # Error messages
family_query_response.response # The response (array of QueryResults containing the data queried)
# To get direct access to the first QueryResult, we can run:
family_query_response.first() # same as family_query_response.response[0]
# Running the search, we should have obtained a list of 10 family results (if there are families defined
# with that disorder). If we were only interested in seeing the first of the families, we could use:
family_query_response.result() # same as family_query_response.response[0].result[0]
# Or, to iterate over and check all the results, we could execute the following script.
# The loop body must be indented for the script to be valid Python.
for family in family_query_response.results():
    print(family['id'])
# The following script gives the same behaviour by walking every QueryResult in the response
# and then every entry of its 'result' list, which is why 'results()' is so handy.
for query_result in family_query_response.response:
    for family in query_result['result']:
        print(family['id'])
# To know the exact number of results obtained, we can run:
family_query_response.num_total_results()
# Now that we know how to work with the OpenCGA QueryResponse object, we will write a script to fetch all the
# variants falling in the 'BMPR2' gene found in any member of the family. The variant query is limited to a
# maximum of 10 results and is meant to exclude the sample information (sample information can be huge and
# would make this query much slower).
for family in oc.families.search(study="study1", limit=10, disorders="Rod-cone dystrophy", include="id").results():
    print("Family: " + family['id'])
    # NOTE(review): includeSamples=None is kept exactly as in the original example; confirm that this is how
    # the installed client/server version expects sample data to be excluded.
    variant_response = oc.variant.query(family=family['id'], gene='BMPR2', study='study1', includeSamples=None, limit=10)
    # The 'else' belongs to this 'if' (not to the inner 'for'): the message is printed only when the query
    # returned no variants for the family.
    if variant_response.num_total_results() > 0:
        for variant in variant_response.results():
            # One line per variant: chromosome:start-end<TAB>type
            print(variant['chromosome'] + ":" + str(variant['start']) + "-" + str(variant['end']) + '\t' + variant['type'])
    else:
        print("No variant results found")
    # Blank line separating the output of one family from the next.
    print()
|
...