hchen725 committed on
Commit
6cfc5c4
1 Parent(s): be7ceb5

Update geneformer/tokenizer.py

Browse files
Files changed (1) hide show
  1. geneformer/tokenizer.py +8 -2
geneformer/tokenizer.py CHANGED
@@ -126,8 +126,11 @@ def sum_ensembl_ids(
126
  gene_ids_collapsed = [
127
  gene_mapping_dict.get(gene_id.upper()) for gene_id in data.ra.ensembl_id
128
  ]
 
 
 
129
 
130
- if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed)):
131
  # Keep original Ensembl IDs as `ensembl_id_original`
132
  rename_attr(data.ra, "ensembl_id", "ensembl_id_original")
133
  data.ra["ensembl_id"] = gene_ids_collapsed
@@ -223,7 +226,10 @@ def sum_ensembl_ids(
223
  gene_ids_collapsed = [
224
  gene_mapping_dict.get(gene_id.upper()) for gene_id in data.var.ensembl_id
225
  ]
226
- if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed)):
 
 
 
227
  data.var.ensembl_id = data.var.ensembl_id.map(gene_mapping_dict)
228
  return data
229
 
 
126
  gene_ids_collapsed = [
127
  gene_mapping_dict.get(gene_id.upper()) for gene_id in data.ra.ensembl_id
128
  ]
129
+ gene_ids_collapsed_in_dict = [
130
+ gene for gene in gene_ids_collapsed if gene in gene_token_dict.keys()
131
+ ]
132
 
133
+ if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
134
  # Keep original Ensembl IDs as `ensembl_id_original`
135
  rename_attr(data.ra, "ensembl_id", "ensembl_id_original")
136
  data.ra["ensembl_id"] = gene_ids_collapsed
 
226
  gene_ids_collapsed = [
227
  gene_mapping_dict.get(gene_id.upper()) for gene_id in data.var.ensembl_id
228
  ]
229
+ gene_ids_collapsed_in_dict = [
230
+ gene for gene in gene_ids_collapsed if gene in gene_token_dict.keys()
231
+ ]
232
+ if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
233
  data.var.ensembl_id = data.var.ensembl_id.map(gene_mapping_dict)
234
  return data
235