#!/usr/bin/env python
"""List heavily published genes that have no structure in the PDB.

The script combines two data sources:

* the result of the PDB web service call ``getGenomeDetails()``, cached
  locally in a pickle file so the service is only queried once, and
* a whitespace-separated text file whose first column is parsed as a
  publication count and whose second column is parsed as a gene id.

Every (count, gene) pair from the text file whose count reaches the
threshold and whose gene id does not occur in the PDB data is printed as
``count gene``, in the order in which it appears in the file.
"""
import os
import sys

try:
    import cPickle as pickle  # Python 2: C implementation
except ImportError:
    import pickle  # Python 3: the C accelerator is used automatically

wsdlfile = "http://www.pdb.org/pdb/services/pdbws?wsdl"
hot_pickle_filename = 'find_hot_structures.pickle'
gene2pubmed_filename = 'gene2pubmed_sorted_by_max_refs.out'

# Pairs with fewer publications than this are not reported.
MIN_PUBLICATIONS = 50


def load_genome_details(cache_filename=hot_pickle_filename, wsdl=wsdlfile):
    """Return the ``getGenomeDetails()`` result, using a pickle cache.

    If *cache_filename* exists, its unpickled content is returned and the
    web service is not contacted. Otherwise the service described by
    *wsdl* is queried and the result is written to *cache_filename*
    before being returned.
    """
    if os.path.exists(cache_filename):
        # SECURITY: unpickling can execute arbitrary code. Only use a
        # cache file that this script wrote itself.
        # Pickles are binary data, so the file is opened in binary mode.
        with open(cache_filename, 'rb') as cache:
            return pickle.load(cache)

    # Imported here so that runs served from the cache need neither the
    # suds package nor network access.
    from suds.client import Client

    results = Client(wsdl).service.getGenomeDetails()
    with open(cache_filename, 'wb') as cache:
        pickle.dump(results, cache)
    return results


def pdb_gene_ids(results):
    """Return the set of non-zero gene ids found in *results*.

    Each entry of *results* is split on whitespace and its third field is
    parsed as an integer gene id. Ids equal to 0 are ignored (presumably
    0 marks "no gene assigned" -- confirm against the service docs).
    """
    gene_ids = set()
    for result in results:
        gene_id = int(result.split()[2])
        if gene_id != 0:
            gene_ids.add(gene_id)
    return gene_ids


def read_gene_publication_counts(filename=gene2pubmed_filename):
    """Return ``[(count, gene), ...]`` parsed from *filename*.

    The first two whitespace-separated fields of every line are parsed as
    integers; file order is preserved. Blank lines are skipped.
    """
    pairs = []
    with open(filename) as handle:
        for line in handle:
            words = line.split()
            if not words:
                continue  # tolerate blank lines, e.g. at end of file
            pairs.append((int(words[0]), int(words[1])))
    return pairs


def hot_genes_without_structure(gene_pubs, pdb_genes,
                                min_pubs=MIN_PUBLICATIONS):
    """Return the pairs of *gene_pubs* that are "hot" but lack a structure.

    *gene_pubs* is an iterable of ``(count, gene)`` pairs and *pdb_genes*
    a container of gene ids (a set gives constant-time lookups). A pair
    is kept when ``count >= min_pubs`` and ``gene`` is not in *pdb_genes*.
    Input order is preserved.
    """
    return [(count, gene) for count, gene in gene_pubs
            if count >= min_pubs and gene not in pdb_genes]


def main():
    """Print ``count gene`` for every hot gene without a PDB structure."""
    pdb_genes = pdb_gene_ids(load_genome_details())
    gene_pubs = read_gene_publication_counts()
    for count, gene in hot_genes_without_structure(gene_pubs, pdb_genes):
        # A single formatted string prints identically on Python 2 and 3.
        print("%s %s" % (count, gene))


if __name__ == "__main__":
    main()