
Documentation of the main classes

1. Setup Data Folder

Module to set up a data directory with a predefined structure.

This module provides the SetupDataFolder class, which creates a directory structure for a data folder. The structure includes Nodes and Relationships folders with specified subfolders.
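For reference, a sketch of the layout this produces (derived from the structure dictionary in the source below; all folders are created relative to the current working directory):

Data/
├── Nodes/
│   └── Compound_Properties/
└── Relationships/
    ├── Assay_Compound_Relationship/
    ├── Compound_Similarities/
    ├── Cpd_Cpd_CoOccurrence/
    ├── Cpd_Gene_CoOccurrence/
    ├── Compound_Gene_Relationship/
    └── Assay_Compound_Relationship_Processed/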

Classes:

- SetupDataFolder: Class to set up a data directory with a predefined structure.

Functions:

- main: Main function to set up the data directory.

SetupDataFolder

Class to set up a data directory with a predefined structure.

Attributes:

- data_folder (str): The name of the data folder.
- base_path (str): The base path for the data directory.
- structure (dict): The structure of directories to create.

Source code in chemgraphbuilder/setup_data_folder.py
class SetupDataFolder:
    """
    Class to set up a data directory with a predefined structure.

    Attributes:
        data_folder (str): The name of the data folder.
        base_path (str): The base path for the data directory.
        structure (dict): The structure of directories to create.
    """

    def __init__(self):
        """
        Initializes the SetupDataFolder with the data folder name and directory structure.
        """
        self.data_folder = "Data"
        self.base_path = os.path.join(os.getcwd(), self.data_folder)
        self.structure = {
            "Nodes": ["Compound_Properties"],
            "Relationships": [
                "Assay_Compound_Relationship",
                "Compound_Similarities",
                "Cpd_Cpd_CoOccurrence",
                "Cpd_Gene_CoOccurrence",
                "Compound_Gene_Relationship",
                "Assay_Compound_Relationship_Processed"
            ]
        }

    @staticmethod
    def create_folder(path):
        """
        Creates a folder if it does not already exist.

        Args:
            path (str): The path of the folder to create.
        """
        if not os.path.exists(path):
            os.makedirs(path)
            print(f"Created folder: {path}")
        else:
            print(f"Folder already exists: {path}")

    def setup(self):
        """
        Sets up the data directory structure based on the predefined structure.
        """
        # Create the base data directory
        self.create_folder(self.base_path)

        # Create the 'Nodes' directory and its subdirectories
        nodes_path = os.path.join(self.base_path, "Nodes")
        self.create_folder(nodes_path)
        for folder in self.structure["Nodes"]:
            self.create_folder(os.path.join(nodes_path, folder))

        # Create the 'Relationships' directory and its subdirectories
        relationships_path = os.path.join(self.base_path, "Relationships")
        self.create_folder(relationships_path)
        for folder in self.structure["Relationships"]:
            self.create_folder(os.path.join(relationships_path, folder))

__init__()

Initializes the SetupDataFolder with the data folder name and directory structure.

Source code in chemgraphbuilder/setup_data_folder.py
def __init__(self):
    """
    Initializes the SetupDataFolder with the data folder name and directory structure.
    """
    self.data_folder = "Data"
    self.base_path = os.path.join(os.getcwd(), self.data_folder)
    self.structure = {
        "Nodes": ["Compound_Properties"],
        "Relationships": [
            "Assay_Compound_Relationship",
            "Compound_Similarities",
            "Cpd_Cpd_CoOccurrence",
            "Cpd_Gene_CoOccurrence",
            "Compound_Gene_Relationship",
            "Assay_Compound_Relationship_Processed"
        ]
    }

create_folder(path) staticmethod

Creates a folder if it does not already exist.

Parameters:

- path (str): The path of the folder to create. Required.
Source code in chemgraphbuilder/setup_data_folder.py
@staticmethod
def create_folder(path):
    """
    Creates a folder if it does not already exist.

    Args:
        path (str): The path of the folder to create.
    """
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Created folder: {path}")
    else:
        print(f"Folder already exists: {path}")

setup()

Sets up the data directory structure based on the predefined structure.

Source code in chemgraphbuilder/setup_data_folder.py
def setup(self):
    """
    Sets up the data directory structure based on the predefined structure.
    """
    # Create the base data directory
    self.create_folder(self.base_path)

    # Create the 'Nodes' directory and its subdirectories
    nodes_path = os.path.join(self.base_path, "Nodes")
    self.create_folder(nodes_path)
    for folder in self.structure["Nodes"]:
        self.create_folder(os.path.join(nodes_path, folder))

    # Create the 'Relationships' directory and its subdirectories
    relationships_path = os.path.join(self.base_path, "Relationships")
    self.create_folder(relationships_path)
    for folder in self.structure["Relationships"]:
        self.create_folder(os.path.join(relationships_path, folder))

main()

Main function to set up the data directory.

Source code in chemgraphbuilder/setup_data_folder.py
def main():
    """
    Main function to set up the data directory.
    """
    data_folder_setup = SetupDataFolder()
    data_folder_setup.setup()
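A minimal usage sketch (assuming the import path chemgraphbuilder.setup_data_folder shown above; the two calls are equivalent):

>>> from chemgraphbuilder.setup_data_folder import SetupDataFolder, main
>>> main()                      # runs SetupDataFolder().setup()
>>> SetupDataFolder().setup()   # or drive the class directly

Existing folders are left untouched; create_folder only prints a notice when a path already exists.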

2. Neo4j Driver

Module for managing connections to a Neo4j database.

This module provides classes and methods to establish and manage connections with a Neo4j database, including custom error handling.

Neo4jBase

Base class to manage connections with the Neo4j database.

Attributes:

- uri: The connection URI for the Neo4j database.
- user: The username to use for authentication.
- driver: The driver object used to interact with the Neo4j database.

Methods:

- connect_to_neo4j: Establish a connection to the Neo4j database.
- close: Close the connection to the Neo4j database.

Source code in chemgraphbuilder/neo4jdriver.py
class Neo4jBase:
    """
    Base class to manage connections with the Neo4j database.

    Attributes:
    - uri: The connection URI for the Neo4j database.
    - user: The username to use for authentication.
    - driver: The driver object used to interact with the Neo4j database.

    Methods:
    - connect_to_neo4j: Establish a connection to the Neo4j database.
    - close: Close the connection to the Neo4j database.
    """

    def __init__(self, logger=None, uri="tcp://5.tcp.eu.ngrok.io:12445", user="neo4j"):
        self.uri = uri
        self.user = user
        self.driver = None

        # Set up logging configuration
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logger or logging.getLogger(__name__)

    def connect_to_neo4j(self):
        """Establish a connection to the Neo4j database using provided URI and username."""
        password = os.getenv("NEO4J_PASSWORD")  # Check if password is set in environment variables
        if not password:
            password = getpass.getpass(prompt="Enter Neo4j password: ")

        try:
            self.driver = GraphDatabase.driver(self.uri, auth=(self.user, password))
            self.logger.info("Successfully connected to the Neo4j database.")
        except Exception as e:
            self.logger.error("Failed to connect to the Neo4j database: %s", e)
            raise Neo4jConnectionError("Failed to connect to the Neo4j database.") from e

    def close(self):
        """Close the connection to the Neo4j database."""
        if self.driver:
            self.driver.close()
            self.logger.info("Neo4j connection closed successfully.")

close()

Close the connection to the Neo4j database.

Source code in chemgraphbuilder/neo4jdriver.py
def close(self):
    """Close the connection to the Neo4j database."""
    if self.driver:
        self.driver.close()
        self.logger.info("Neo4j connection closed successfully.")

connect_to_neo4j()

Establish a connection to the Neo4j database using provided URI and username.

Source code in chemgraphbuilder/neo4jdriver.py
def connect_to_neo4j(self):
    """Establish a connection to the Neo4j database using provided URI and username."""
    password = os.getenv("NEO4J_PASSWORD")  # Check if password is set in environment variables
    if not password:
        password = getpass.getpass(prompt="Enter Neo4j password: ")

    try:
        self.driver = GraphDatabase.driver(self.uri, auth=(self.user, password))
        self.logger.info("Successfully connected to the Neo4j database.")
    except Exception as e:
        self.logger.error("Failed to connect to the Neo4j database: %s", e)
        raise Neo4jConnectionError("Failed to connect to the Neo4j database.") from e

Neo4jConnectionError

Bases: Exception

Custom exception for Neo4j connection errors.

Source code in chemgraphbuilder/neo4jdriver.py
class Neo4jConnectionError(Exception):
    """Custom exception for Neo4j connection errors."""

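A minimal connection sketch (assuming the import path chemgraphbuilder.neo4jdriver shown above and a reachable Neo4j instance; the bolt://localhost URI is a hypothetical local address, not the class default):

>>> import os
>>> from chemgraphbuilder.neo4jdriver import Neo4jBase
>>> os.environ["NEO4J_PASSWORD"] = "..."  # set beforehand to skip the getpass prompt
>>> db = Neo4jBase(uri="bolt://localhost:7687", user="neo4j")
>>> db.connect_to_neo4j()  # raises Neo4jConnectionError on failure
>>> db.close()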
3. Node Properties Extractor

This module defines the NodePropertiesExtractor class, responsible for extracting data from the PubChem database to build knowledge graphs in Neo4j. The class focuses on nodes representing chemical entities and their relationships, allowing users to query chemical data and construct a graph-based representation of chemical compounds, their assays, related genes, and proteins.

The primary functionality revolves around fetching detailed information about specified enzymes from PubChem, including assay data, gene properties, protein properties, and compound properties. It processes this data into a structured format suitable for knowledge graph construction, specifically tailored for use with Neo4j databases.

Classes:

- NodePropertiesExtractor: A class to extract data from PubChem to build knowledge graphs in Neo4j.

Usage Example

>>> enzyme_list = ['CYP2D6', 'CYP3A4']
>>> extractor = NodePropertiesExtractor(enzyme_list)
>>> df = extractor.run()

This example initiates the extractor with a list of enzymes, fetches their data from PubChem, processes it, and potentially prepares it for knowledge graph construction in Neo4j.

Note

To fully utilize this class, ensure you have network access to the PubChem API for data retrieval and a Neo4j database instance for knowledge graph construction. The class methods facilitate data extraction and processing, but integrating the output into Neo4j requires additional steps outside the scope of this class.
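A hedged end-to-end sketch of that workflow, using the method names and Data/ file paths that appear in the docstring examples below:

>>> from chemgraphbuilder.node_properties_extractor import NodePropertiesExtractor
>>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
>>> extractor.create_data_directories()  # prepare the Data/ tree
>>> df = extractor.run()                 # writes Data/AllDataConnected.csv
>>> extractor.extract_gene_properties('Data/AllDataConnected.csv')
>>> extractor.extract_assay_properties('Data/AllDataConnected.csv')
>>> extractor.extract_protein_properties('Data/AllDataConnected.csv')
>>> extractor.extract_compound_properties('Data/AllDataConnected.csv')

Each extract_* call writes its output under Data/Nodes/, as documented for the individual methods below.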

NodePropertiesExtractor

Extracts data from PubChem to build knowledge graphs in Neo4j, focusing on nodes representing chemical entities and their relationships. This class serves as a bridge between the PubChem database and Neo4j, allowing users to query chemical data and construct a graph-based representation of chemical compounds, their assays, related genes, and proteins.

The primary functionality revolves around fetching detailed information about specified enzymes from PubChem, including assay data, gene properties, protein properties, and compound properties. It processes this data into a structured format suitable for knowledge graph construction, specifically tailored for use with Neo4j databases.

Attributes:

- enzyme_list (list of str): Enzymes to query in the PubChem database.
- _base_url (str): Base URL for the PubChem API requests.
- _sep (str): Delimiter for parsing CSV data from PubChem.
- _enzyme_count (int): Number of enzymes in the enzyme_list, calculated at initialization.

Parameters:

- enzyme_list (list of str): List of enzyme names for which assay data will be fetched from PubChem. Required.
- base_url (str, optional): Base URL for PubChem API requests. Defaults to 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/target/genesymbol'.
- sep (str, optional): Separator used for parsing CSV data returned by PubChem. Defaults to ','.
Usage Example

>>> enzyme_list = ['CYP2D6', 'CYP3A4']
>>> extractor = NodePropertiesExtractor(enzyme_list)
>>> df = extractor.run()

This example initiates the extractor with a list of enzymes, fetches their data from PubChem, processes it, and potentially prepares it for knowledge graph construction in Neo4j.

Note

To fully utilize this class, ensure you have network access to the PubChem API for data retrieval and a Neo4j database instance for knowledge graph construction. The class methods facilitate data extraction and processing, but integrating the output into Neo4j requires additional steps outside the scope of this class.

Source code in chemgraphbuilder/node_properties_extractor.py
class NodePropertiesExtractor:
    """
    Extracts data from PubChem to build knowledge graphs in Neo4j,
    focusing on nodes representing chemical entities and their relationships.
    This class serves as a bridge between the PubChem database and Neo4j,
    allowing users to query chemical data and construct a graph-based
    representation of chemical compounds, their assays, related genes, and proteins.

    The primary functionality revolves around fetching detailed information
    about specified enzymes from PubChem, including assay data, gene properties,
    protein properties, and compound properties. It processes this data into
    a structured format suitable for knowledge graph construction, specifically
    tailored for use with Neo4j databases.

    Attributes:
        enzyme_list (list of str): Enzymes to query in the PubChem database.
        _base_url (str): Base URL for the PubChem API requests.
        _sep (str): Delimiter for parsing CSV data from PubChem.
        _enzyme_count (int): Number of enzymes in the enzyme_list, calculated at
        initialization.

    Parameters:
        enzyme_list (list of str): List of enzyme names for which assay data
        will be fetched from PubChem.
        base_url (str, optional): Base URL for PubChem API requests. Defaults to
        the assay target genesymbol endpoint.
        sep (str, optional): Separator used for parsing CSV data returned
        by PubChem. Defaults to ','.

    Usage Example:
        >>> enzyme_list = ['CYP2D6', 'CYP3A4']
        >>> extractor = NodePropertiesExtractor(enzyme_list)
        >>> df = extractor.run()
        This example initiates the extractor with a list of enzymes, fetches
        their data from PubChem, processes it, and potentially prepares it for
        knowledge graph construction in Neo4j.

    Note:
        To fully utilize this class, ensure you have network access to the
        PubChem API for data retrieval and a Neo4j database instance for
        knowledge graph construction. The class methods facilitate data extraction
        and processing, but integrating the output into Neo4j requires additional
        steps outside the scope of this class.
    """

    _REQUEST_TIMEOUT = 30  # in seconds
    _CONCURRENT_REQUEST_LIMIT = 2
    _RETRY_ATTEMPTS = 3  # number of times to retry a failed request

    def __init__(self, enzyme_list,
                 base_url="https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/target/genesymbol",
                 sep=","):
        """
        Initializes a NodePropertiesExtractor instance, setting up the base URL
        for API requests, the separator for CSV parsing, and the list of enzymes
        to query from the PubChem database.

        Parameters:
            enzyme_list (list of str): A list of enzyme names for which to fetch
            assay data.
            base_url (str, optional): The base URL for PubChem API requests.
            Default is set to the assay target genesymbol endpoint.
            sep (str, optional): The delimiter to use for parsing CSV files
            returned by PubChem. Defaults to ','.

        Attributes:
            _base_url (str): Stores the base URL for API requests.
            _sep (str): Stores the delimiter for parsing CSV data.
            enzyme_list (list of str): Stores the list of enzyme names provided
            during initialization.
            _enzyme_count (int): The number of enzymes in the enzyme_list.
        """
        self._base_url = base_url
        self._sep = sep
        self.enzyme_list = enzyme_list
        self._enzyme_count = len(enzyme_list)

    def _make_request(self, url):
        """
        Sends an HTTP GET request to a specified URL with built-in retry logic.
        If the request fails, it retries the request up to a predefined number
        of attempts with exponential backoff to handle potential temporary network
        or server issues.

        The method attempts to gracefully handle server-side errors
        (HTTP 4XX/5XX responses) by raising an exception if the response status
        code indicates an error. For client-side errors (e.g., connectivity issues),
        it logs a warning and retries the request.

        Parameters:
            url (str): The complete URL to which the HTTP GET request is sent.

        Returns:
            requests.Response: The response object from the server if the request
            is successfully completed.

        Raises:
            requests.RequestException: If the request fails to complete
            successfully after the maximum number of retry attempts.
        """
        for attempt in range(self._RETRY_ATTEMPTS):
            try:
                response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
                response.raise_for_status()  # Checks for HTTP errors
                return response
            except requests.RequestException as e:
                logging.warning("Attempt %s of %s failed for URL: %s. Error: %s",
                                attempt + 1, self._RETRY_ATTEMPTS, url, e)
                if attempt + 1 == self._RETRY_ATTEMPTS:
                    raise  # All attempts failed; re-raise the last exception
                time.sleep(2 ** attempt)  # Exponential backoff

    def get_enzyme_assays(self, enzyme):
        """
        Fetches assay data for a specified enzyme from the PubChem database and
        returns it as a pandas DataFrame.

        This method constructs a URL to query the PubChem database for concise
        assay data related to the given enzyme. It processes the CSV response
        into a DataFrame, which includes various assay data points provided by PubChem.

        Parameters:
            enzyme (str): The name of the enzyme for which assay data is
            requested. This name is used in the API query.

        Returns:
            pd.DataFrame: A DataFrame containing the assay data fetched from
            PubChem for the specified enzyme. The DataFrame includes columns
            based on the CSV response from PubChem, such as assay ID, results,
            and conditions. Returns None if no data is available or if an error
            occurs during data fetching or processing.

        Raises:
            requests.RequestException: If an error occurs during the HTTP
            request to the PubChem API.
            pd.errors.EmptyDataError: If the response from PubChem contains no data.

        Example:
            >>> extractor = NodePropertiesExtractor(['enzyme'])
            >>> enzyme_assays_df = extractor.get_enzyme_assays('enzyme')
            >>> print(enzyme_assays_df.head())
        """
        assays_url = f"{self._base_url}/{enzyme.lower()}/concise/CSV"
        logging.info(f"Fetching assays for enzyme: {enzyme}")

        response = self._make_request(assays_url)

        assays_csv_string = response.text
        assays_csv_string_io = StringIO(assays_csv_string)
        try:
            assays_df = pd.read_csv(assays_csv_string_io,
                                    sep=self._sep,
                                    low_memory=False)
            logging.info("Assays DataFrame for enzyme %s has shape: %s",
                         enzyme, assays_df.shape)
            return assays_df
        except pd.errors.EmptyDataError:
            logging.warning("No data available for enzyme %s.", enzyme)
            return None

    def _process_enzymes(self, enzyme_list):
        """
        Iterates over a list of enzyme names, fetching assay data for each enzyme
        and aggregating the results into a list of DataFrames.

        This method calls `get_enzyme_assays` for each enzyme in the provided
        list, collecting the assay data (if available) into a list of pandas
        DataFrames. This list can then be used for further processing or analysis.

        Parameters:
            enzyme_list (list of str): A list containing the names of enzymes
            for which to fetch assay data.

        Returns:
            list of pd.DataFrame: A list containing a pandas DataFrame for each
            enzyme for which assay data was successfully fetched and processed.
            Each DataFrame includes the assay data from PubChem for that enzyme.
            If no data is available for an enzyme, it is omitted from the list.
        """
        df_list = [self.get_enzyme_assays(enzyme) for enzyme in enzyme_list]
        return [df for df in df_list if df is not None]

    def _concatenate_data(self, df_list):
        """
        Concatenates a list of pandas DataFrames into a single DataFrame.
        This method is useful for aggregating
        data fetched from multiple sources or APIs into a unified structure.
        If the list is empty, it returns None to indicate that no data was
        aggregated.

        Parameters:
            df_list (List[pd.DataFrame]): A list of pandas DataFrames to
            concatenate. These DataFrames should have the same structure
            (columns) to ensure proper concatenation.

        Returns:
            pd.DataFrame or None: A single concatenated DataFrame comprising all
            rows from the input DataFrames, indexed continuously. Returns None
            if the input list is empty, indicating there is no data to concatenate.
        """
        if df_list:
            return pd.concat(df_list, ignore_index=True)
        return None

    def run(self):
        """
        Orchestrates the process of fetching, filtering, and aggregating assay
        data from PubChem for a predefined list of enzymes.

        This method iteratively queries PubChem for assay data corresponding
        to each enzyme specified in the `enzyme_list` attribute during class
        initialization. It performs the following steps for each enzyme:
        1. Constructs a query URL and fetches assay data from PubChem.
        2. Filters the fetched data based on predefined criteria
        (e.g., containing specific substrings in the assay name).
        3. Aggregates the filtered data into a single pandas DataFrame.
        4. Identifies enzymes for which data could not be fetched or were
        excluded based on filtering criteria, logging their names.

        The final aggregated DataFrame, containing assay data for all successfully
        processed enzymes, is then saved to a CSV file. This method facilitates
        the extraction and preprocessing of chemical assay data for further
        analysis or integration into knowledge graphs.

        Note:
            - This method relies on the successful response from PubChem
            for each enzyme query.
            - Enzymes with no available data or failing to meet the filtering
            criteria are excluded from the final DataFrame.
            - The output CSV file is saved in the current working directory
            with the name 'Data/AllDataConnected.csv'.

        Returns:
            pd.DataFrame: A DataFrame containing the aggregated and filtered
            assay data for the specified enzymes. Columns in the DataFrame
            correspond to the assay data fields returned by PubChem, subject to
            the filtering criteria applied within this method.

        Raises:
            requests.RequestException: If there is an issue with fetching data
            from PubChem, such as a network problem or an invalid response.

        Example:
            Assuming `enzyme_list` was set to ['CYP2D6', 'CYP3A4'] during
            class initialization:

            >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
            >>> extractor.create_data_directories()
            >>> result_df = extractor.run()
            >>> print(result_df.head())

            This will fetch and process assay data for 'CYP2D6' and 'CYP3A4',
            returning a DataFrame with the processed data.
        """

        # Initialize an empty list to store enzymes with successful responses
        enzymes_with_response = []

        # Keep a copy of the original list to identify removed enzymes later
        original_enzyme_list = self.enzyme_list.copy()

        for enzyme in self.enzyme_list:
            # Formulate the URL
            url = f"{self._base_url}/{enzyme}/concise/CSV"

            try:
                response = requests.get(url)
                # Check for a successful response (status code 200)
                if response.status_code == 200:
                    enzymes_with_response.append(enzyme)  # Keep the enzyme in the new list
            except requests.RequestException:
                # If there's an exception, skip adding the enzyme to the new list
                pass

        # Update the enzyme list with only the enzymes that had a successful response
        self.enzyme_list = enzymes_with_response

        # Identify and print the removed enzymes
        removed_enzymes = [enzyme for enzyme in original_enzyme_list if enzyme not in enzymes_with_response]
        if removed_enzymes:
            logging.info("These enzymes were removed because their names aren't correct: %s",
                         ", ".join(removed_enzymes))

        df_list = self._process_enzymes(self.enzyme_list)
        df = self._concatenate_data(df_list)
        substrings_to_filter = ['CYP', 'Cytochrome']
        pattern = '|'.join(substrings_to_filter)
        df = df[df['Assay Name'].str.contains(pattern, case=False, na=False)]
        df.to_csv('Data/AllDataConnected.csv', index=False)
        return df

    def _fetch_gene_details(self, gene_id):
        """
        Fetches gene details in parallel using the PubChem API.

        Args:
            gene_id (int): The gene ID for fetching details.

        Returns:
            tuple: Contains gene ID, symbol, taxonomy, taxonomy ID, and synonyms.
        """
        BASE_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        url = f"{BASE_URL}/gene/geneid/{int(gene_id)}/summary/JSON"
        try:
            response = self._make_request(url)
            data = response.json()

            # Extracting the necessary details
            symbol = data['GeneSummaries']['GeneSummary'][0].get('Symbol', None)
            taxonomy = data['GeneSummaries']['GeneSummary'][0].get('Taxonomy', None)
            taxonomy_id = data['GeneSummaries']['GeneSummary'][0].get('TaxonomyID', None)
            synonyms = data['GeneSummaries']['GeneSummary'][0].get('Synonym', None)
            # print(type(synonyms))
            return gene_id, symbol, taxonomy, taxonomy_id, synonyms
        except Exception as e:
            logging.error(f"Error fetching details for gene_id {gene_id}: {e}")
            return gene_id, None, None, None, None

    def extract_gene_properties(self, main_data):
        """
        Extracts and processes gene properties from a given data source,
        specifically targeting genes relevant to the study (e.g., CYP enzymes)
        and records their details in a structured DataFrame.

        This method reads gene data from a CSV file specified by `main_data`,
        queries the PubChem database for additional properties of each unique
        gene ID found in the file, and compiles these properties into a new
        DataFrame. It focuses on fetching details like gene symbols, taxonomy,
        taxonomy IDs, and synonyms for each gene. The final DataFrame is filtered
        to include only genes of particular interest (e.g., certain CYP enzymes)
        and saved to a separate CSV file for further analysis or use.

        Parameters:
            main_data (str): Path to a CSV file containing the main data,
            which was generated after running `extractor.run()`.

        Returns:
            pd.DataFrame: A DataFrame containing the compiled gene properties,
            including GeneID, Symbol, Taxonomy, Taxonomy ID, and Synonyms,
            filtered to include only specified genes of interest. This DataFrame
            is also saved to 'Data/Nodes/Gene_Properties.csv'.

        Raises:
            Exception: If there's an issue reading the initial CSV file or
            fetching gene details from PubChem, details of the exception are
            logged, and the method proceeds to process the next gene ID.

        Example:
            >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
            >>> extractor.create_data_directories()
            >>> extractor.run()
            >>> gene_properties_df = extractor.extract_gene_properties('Data/AllDataConnected.csv')
            >>> print(gene_properties_df.head())

            This would read gene IDs from 'Data/AllDataConnected.csv', fetch
            their properties from PubChem, and compile the details into a
            DataFrame, filtering for specified genes of interest and saving
            the results to 'Data/Nodes/Gene_Properties.csv'.

        Note:
            The method filters the resulting DataFrame to include only genes with
            symbols in the predefined enzyme_list. Adjust this list as necessary
            to match the focus of your study or application.
        """
        df = pd.read_csv(main_data)
        df_gene = pd.DataFrame(columns=['GeneID', 'Symbol', 'Taxonomy',
                                        'Taxonomy ID', 'Synonyms'])

        unique_gene_ids = df['Target GeneID'].unique().tolist()

        gene_details = []

        for gene_id in unique_gene_ids:
            try:
                gene_id, symbol, taxonomy, taxonomy_id, synonyms = self._fetch_gene_details(gene_id)
                gene_details.append({
                    'GeneID': gene_id,
                    'Symbol': symbol,
                    'Taxonomy': taxonomy,
                    'Taxonomy ID': taxonomy_id,
                    'Synonyms': str(synonyms)
                })
            except Exception as exc:
                logging.error("Error occurred while processing gene_id %s: %s",
                              gene_id, exc)
                gene_details.append({
                    'GeneID': gene_id,
                    'Symbol': None,
                    'Taxonomy': None,
                    'Taxonomy ID': None,
                    'Synonyms': None
                })

        # Now create the DataFrame from the list of dictionaries
        df_gene = pd.DataFrame(gene_details)
        n = self._enzyme_count
        gene_ids = df['Target GeneID'].value_counts().head(n).index.tolist()
        df_gene = df_gene[df_gene['GeneID'].isin([int(item) for item in gene_ids])]
        df_gene.to_csv('Data/Nodes/Gene_Properties.csv', sep=',', index=False)
        return df_gene

    def _fetch_assay_details(self, aid):
        """
        Fetches assay details from the PubChem API for a given assay ID.

        Args:
            aid (int): The assay ID to fetch details for.

        Returns:
            dict: A dictionary containing assay details like AID, SourceName,
            SourceID, Name, and Description. Returns None if an error occurs
            during fetching or parsing.
        """
        BASE_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        url = f"{BASE_URL}/assay/aid/{aid}/summary/XML"  # Constructing the API URL
        response = self._make_request(url)  # Making the API request
        xml_data = response.text  # Getting the response text

        try:
            # Parsing the XML response
            data_dict = xmltodict.parse(xml_data)
            properties = ['AID', 'SourceName', 'SourceID', 'Name', 'Description',
                          'Protocol', 'Comment', 'Method', 'Target', 'CIDCountAll',
                          'CIDCountActive', 'CIDCountInactive', 'CIDCountInconclusive',
                          'CIDCountUnspecified', 'CIDCountProbe']

            assay_data = {}
            # Extracting required properties from the parsed XML
            for prop in properties:
                assay_data[prop] = data_dict.get('AssaySummaries', {}).get('AssaySummary', {}).get(prop, None)
            return assay_data
        except Exception as e:
            logging.error(f"Error parsing XML for AID {aid}: {e}")
            return None

    def extract_assay_properties(self, main_data):
        """
        Extracts detailed properties of assays from PubChem for each unique assay
        ID found in the input data file.

        This method processes an input CSV file containing assay IDs (AID) and
        performs concurrent HTTP requests to fetch detailed assay properties
        from the PubChem database. The retrieved details include assay type,
        activity name, source name, source ID, name, and description. These
        properties are compiled into a new DataFrame, which is then
        saved to a CSV file for further analysis or use.

        The method employs a ThreadPoolExecutor to manage concurrent requests
        efficiently, improving the performance when dealing with a large number
        of assay IDs. Errors encountered during data fetching are logged, and the
        process continues with the next assay ID, ensuring the method's robustness.

        Parameters:
            main_data (str): Path to a CSV file containing the main data,
            which was generated after running `extractor.run()`.

        Returns:
            pd.DataFrame: A DataFrame containing the fetched assay properties,
            including columns for AID, Assay Type, Activity Name, SourceName,
            SourceID, Name, and Description. This DataFrame is saved to
            'Data/Nodes/Assay_Properties.csv' in the current working directory.

        Raises:
            ValueError: If the input CSV file is empty or does not contain the 'AID' column.

        Example:
            >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
            >>> extractor.create_data_directories()
            >>> extractor.run()
            >>> assay_properties_df = extractor.extract_assay_properties('Data/AllDataConnected.csv')
            >>> print(assay_properties_df.head())

            This example reads assay IDs from 'Data/AllDataConnected.csv',
            queries PubChem for their detailed properties, and compiles the
            results into a DataFrame, which is also saved to 'Data/Nodes/Assay_Properties.csv'.

        Note:
            This method requires network access to the PubChem API and assumes
            the availability of a valid 'AID' column in the input CSV file.
            Ensure the input file path is correct and accessible to avoid errors during processing.
        """

        df = pd.read_csv(main_data)

        # Check if the DataFrame is valid
        if df.empty or 'AID' not in df.columns:
            logging.error("DataFrame is empty or does not contain 'AID' column.")
            return pd.DataFrame()

        unique_aids = df['AID'].unique().tolist()  # Extracting unique assay IDs

        columns = ['AID', 'Assay Type', 'Activity Name', 'SourceName',
                   'SourceID', 'Name', 'Description', 'Protocol',
                   'Comment', 'Method', 'Target', 'CIDCountAll',
                   'CIDCountActive', 'CIDCountInactive', 'CIDCountInconclusive',
                   'CIDCountUnspecified', 'CIDCountProbe']
        assay_df = pd.DataFrame(columns=columns)  # Initializing a DataFrame to store assay properties

        # Using ThreadPoolExecutor for concurrent fetching of assay details
        with concurrent.futures.ThreadPoolExecutor(max_workers=self._CONCURRENT_REQUEST_LIMIT) as executor:
            future_to_aid = {executor.submit(self._fetch_assay_details, aid): aid for aid in unique_aids}

            # Iterating over completed futures
            for future in concurrent.futures.as_completed(future_to_aid):
                aid = future_to_aid[future]
                try:
                    assay_data = future.result()  # Fetching the result from the future
                    if assay_data:
                        # Preparing a new row with the fetched data
                        new_row = {
                            'AID': aid,
                            'Assay Type': df.loc[df['AID'] == aid, 'Assay Type'].iloc[0],
                            'Activity Name': df.loc[df['AID'] == aid, 'Activity Name'].iloc[0],
                            **assay_data
                        }
                        # Adding the new row to the DataFrame
                        assay_df = pd.concat([assay_df, pd.DataFrame([new_row])], ignore_index=True)
                except Exception as exc:
                    # Logging any errors encountered during the fetch
                    logging.error(f"Error occurred while processing AID {aid}: {exc}")

        # Saving the updated DataFrame to a CSV file
        assay_df.to_csv('Data/Nodes/Assay_Properties.csv', sep=',', index=False)
        return assay_df

    def extract_protein_properties(self, main_data):
        """
        Extracts and compiles protein properties from the NCBI protein database
        based on accession numbers.

        Given a CSV file specified by `main_data`, this method reads protein
        accession numbers and performs web scraping on the NCBI protein database
        pages to extract protein titles. The method constructs a URL for
        each accession number, sends a request to retrieve the page content,
        and parses the HTML to find the protein title. The extracted titles,
        along with their corresponding accession numbers and URLs, are
        compiled into a DataFrame. This DataFrame is saved to a CSV file,
        providing a structured summary of protein properties for further analysis or use.

        Parameters:
            main_data (str): Path to a CSV file containing the main data,
            which was generated after running `extractor.run()`.

        Returns:
            pd.DataFrame: A DataFrame with columns 'RefSeq Accession', 'URL',
            and 'Description', where 'Description' contains the title of the
            protein extracted from its NCBI page. This DataFrame is saved to
            'Data/Nodes/Protein_Properties.csv' in the current working directory.

        Raises:
            Exception: If there's an issue reading the initial CSV file or
            querying the NCBI database, details of the exception are logged.
            The method continues processing the next accession number,
            ensuring robustness against individual failures.

        Example:
            Assuming 'protein_data.csv' contains a column 'Target Accession'
            with accession numbers:

            >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
            >>> extractor.create_data_directories()
            >>> extractor.run() # you need to run this only once
            >>> protein_properties_df = extractor.extract_protein_properties('Data/AllDataConnected.csv')
            >>> print(protein_properties_df.head())

            This would read accession numbers from 'Data/AllDataConnected.csv',
            scrape their titles from the NCBI protein database, and compile the
            results into a DataFrame, which is also saved to
            'Data/Nodes/Protein_Properties.csv'.

        Note:
            This method requires internet access to query the NCBI protein
            database. Ensure the input file path is correct and accessible to
            avoid errors during processing. Web scraping is dependent on the
            structure of the web page; changes to the NCBI protein database
            pages may require updates to the scraping logic.
        """

        # Initialize a list to store the extracted data
        data = []

        n = self._enzyme_count
        df = pd.read_csv(main_data)
        gene_ids = df['Target GeneID'].value_counts().head(n).index.tolist()
        df = df[df['Target GeneID'].isin([int(item) for item in gene_ids])]
        Accessions = df['Target Accession'].unique().tolist()
        # Iterate over each protein accession number in the DataFrame
        for accession in Accessions:
            # Construct the URL to query the NCBI protein database
            url = f"https://www.ncbi.nlm.nih.gov/protein/{accession}"

            try:
                # Send an HTTP request to the URL
                response = requests.get(url)

                # Parse the HTML content of the response
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract the title from the parsed HTML
                title = soup.title.string if soup.title else 'Title Not Found'

                # Append the extracted data to the list
                data.append({'RefSeq Accession': accession,
                             'URL': url, 'Description': title})
            except Exception as e:
                # In case of an error, log the error message
                logging.error(f"Error fetching data for accession {accession}: {e}")
                data.append({'RefSeq Accession': accession, 'URL': url,
                             'Description': f'Error: {e}'})

        # Convert the list of data into a DataFrame
        protein_df = pd.DataFrame(data)

        # Save the DataFrame to a CSV file
        protein_df.to_csv('Data/Nodes/Protein_Properties.csv',
                          sep=',', index=False)

        # Return the DataFrame
        return protein_df

    def fetch_data(self, cid):
        """
        Retrieves detailed chemical compound properties for a specified
        Compound ID (CID) from the PubChem database.

        This method constructs a query URL to fetch a wide range of properties
        for the given CID from PubChem, including molecular formula,
        molecular weight, canonical and isomeric SMILES, InChI codes,
        physicochemical properties, and more. If the CID is valid and data is
        available, it returns a pandas DataFrame containing these properties. This
        method also generates a URL to retrieve the structure image of the
        compound as a 2D PNG image, adding it as a column in the DataFrame.
        In cases where the CID is NaN or an error occurs during data retrieval,
        an empty DataFrame is returned.

        Parameters:
            cid (int or float): The Compound ID for which to fetch data.
            Can be an integer or NaN.

        Returns:
            pd.DataFrame: A DataFrame containing the fetched properties for the
            given CID. The DataFrame includes columns for each property fetched
            from PubChem, along with a 'StructureImage2DURL' column containing
            the URL to the compound's structure image. Returns an empty DataFrame
            if the CID is NaN or if any error occurs during the fetch operation.

        Raises:
            Exception: Logs an error message if the request to PubChem fails or
            if the response cannot be processed into a DataFrame.

        Example:
            >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
            >>> extractor.create_data_directories()
            >>> compound_data_df = extractor.fetch_data(2244)
            >>> print(compound_data_df.head())

            This example fetches the properties for the compound with CID 2244
            from PubChem and prints the first few rows
            of the resulting DataFrame.

        Note:
            This method requires an active internet connection to access the
            PubChem database. Ensure that the CID provided is valid and not NaN
            to avoid fetching errors. The structure and availability of data
            fields are subject to the current state of the PubChem database
            and may vary.
        """
        if pd.isna(cid):
            return pd.DataFrame()  # Return an empty DataFrame for NaN CIDs

        cid = int(cid)  # Convert CID to integer
        url = (f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/"
               "MolecularFormula,MolecularWeight,CanonicalSMILES,IsomericSMILES,InChI,"
               "InChIKey,IUPACName,Title,XLogP,ExactMass,MonoisotopicMass,TPSA,Complexity,"
               "Charge,HBondDonorCount,HBondAcceptorCount,RotatableBondCount,HeavyAtomCount,"
               "IsotopeAtomCount,AtomStereoCount,DefinedAtomStereoCount,UndefinedAtomStereoCount,"
               "BondStereoCount,DefinedBondStereoCount,UndefinedBondStereoCount,CovalentUnitCount,"
               "PatentCount,PatentFamilyCount,LiteratureCount,Volume3D,XStericQuadrupole3D,"
               "YStericQuadrupole3D,ZStericQuadrupole3D,FeatureCount3D,FeatureAcceptorCount3D,"
               "FeatureDonorCount3D,FeatureAnionCount3D,FeatureCationCount3D,FeatureRingCount3D,"
               "FeatureHydrophobeCount3D,ConformerModelRMSD3D,EffectiveRotorCount3D,ConformerCount3D,"
               "Fingerprint2D/CSV")
        try:
            response = requests.get(url)
            response.raise_for_status()
            compound_data = pd.read_csv(StringIO(response.text),
                                        sep=',', low_memory=False)
            compound_data['StructureImage2DURL'] = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/PNG"
            return compound_data
        except Exception as e:
            logging.error(f"Error processing CID {cid}: {e}")
            return pd.DataFrame()  # Return an empty DataFrame in case of error

    def extract_compound_properties(self, main_data, start_chunk=0):
        """
        Extracts and aggregates compound properties from PubChem for a list of
        compounds associated with specific genes.

        This method processes a CSV file specified by `main_data`, which contains
        gene identifiers and their associated compound IDs (CIDs). It selects
        compounds related to the top `n` most frequently occurring genes in the
        dataset, where `n` is determined by the instance's `_enzyme_count`
        attribute. The method then fetches detailed compound properties from
        PubChem in chunks, using concurrent requests to improve efficiency and
        manage the load on the PubChem API. The fetched compound properties are
        aggregated into a single DataFrame and saved to multiple CSV files,
        one for each chunk of compound IDs processed.

        Parameters:
            main_data (str): Path to a CSV file containing the main data,
            which was generated after running `extractor.run()`.

        Side Effects:
            - Saves the aggregated compound properties to CSV files in the current
            working directory. The files are named
            'Data/Nodes/Compound_Properties/Chunk_{i}.csv', where `{i}` is
            the chunk index.

        Returns:
            None: This method does not return a value. Instead, it saves the
            fetched compound data directly to CSV files.

        Raises:
            Exception: Logs an error and continues processing the next CID if
            an error occurs while fetching data for a specific CID.

        Example:
            >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
            >>> extractor.create_data_directories()
            >>> extractor.extract_compound_properties('Data/AllDataConnected.csv')
            This will read 'Data/AllDataConnected.csv', filter for compounds
            associated with the top n genes, fetch their properties from PubChem,
            and save the results into multiple CSV files for each chunk
            of compounds processed.

        Note:
            - Ensure that the 'main_data' CSV file exists and is accessible at
            the specified path.
            - The method automatically handles NaN values in the 'CID' column
            and excludes them from processing.
            - The `enzyme_count` attribute determines the number of top genes
            for which compound properties will be fetched.
            - Internet access is required to fetch compound data from the PubChem API.
            - The method employs a `ThreadPoolExecutor` with a configurable
            number of workers (default is len(enzyme_list)) to parallelize
            requests, which can be adjusted based on system capabilities and
            API rate limits.
        """

        n = self._enzyme_count
        df = pd.read_csv(main_data)
        gene_ids = df['Target GeneID'].value_counts().head(n).index.tolist()
        df = df[df['Target GeneID'].isin([int(item) for item in gene_ids])]
        df = df.dropna(subset=['CID'])
        IDs = df['CID'].unique().tolist()

        # Define chunk size and calculate number of chunks
        chunk_size = 10000
        num_chunks = math.ceil(len(IDs) / chunk_size)

        if num_chunks >= start_chunk:
            for i in range(start_chunk, num_chunks):
                # Calculate start and end indices for each chunk
                start_index = i * chunk_size
                end_index = start_index + chunk_size

                # Extract chunk of CIDs
                chunk_cids = IDs[start_index:end_index]
                # chunk_cids = [x for x in chunk_cids if not np.isnan(x)]

                # Use ThreadPoolExecutor to parallelize requests for the chunk
                with ThreadPoolExecutor(max_workers=5) as executor:
                    future_to_cid = {executor.submit(self.fetch_data, cid): cid for cid in chunk_cids}
                    results = []

                    for future in as_completed(future_to_cid):
                        cid = future_to_cid[future]
                        try:
                            data = future.result()
                            results.append(data)
                        except Exception as e:
                            logging.error(f"Error processing CID {cid}: {e}")

                # Concatenate results for the current chunk
                chunk_df = pd.concat(results, ignore_index=True)

                # Save the concatenated DataFrame to a CSV file for the chunk
                chunk_df.to_csv(f'Data/Nodes/Compound_Properties/Chunk_{i}.csv',
                                sep=',', index=False)
        else:
            logging.info("No more chunks to process.")
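
Because compound properties are fetched in chunks of 10,000 CIDs and each chunk is saved to its own file, an interrupted run can be resumed via the start_chunk parameter (a sketch; chunk indices start at 0):

>>> extractor.extract_compound_properties('Data/AllDataConnected.csv', start_chunk=3)

This skips the CIDs already saved in Chunk_0.csv through Chunk_2.csv and resumes at Data/Nodes/Compound_Properties/Chunk_3.csv.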

__init__(enzyme_list, base_url='https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/target/genesymbol', sep=',')

Initializes a NodePropertiesExtractor instance, setting up the base URL for API requests, the separator for CSV parsing, and the list of enzymes to query from the PubChem database.

Parameters:

- enzyme_list (list of str): A list of enzyme names for which to fetch assay data. Required.
- base_url (str, optional): The base URL for PubChem API requests. Defaults to 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/target/genesymbol'.
- sep (str, optional): The delimiter to use for parsing CSV files returned by PubChem. Defaults to ','.

Attributes:

- _base_url (str): Stores the base URL for API requests.
- _sep (str): Stores the delimiter for parsing CSV data.
- enzyme_list (list of str): Stores the list of enzyme names provided during initialization.
- _enzyme_count (int): The number of enzymes in the enzyme_list.

Source code in chemgraphbuilder/node_properties_extractor.py
def __init__(self, enzyme_list,
             base_url="https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/target/genesymbol",
             sep=","):
    """
    Initializes a NodePropertiesExtractor instance, setting up the base URL
    for API requests, the separator for CSV parsing, and the list of enzymes
    to query from the PubChem database.

    Parameters:
        enzyme_list (list of str): A list of enzyme names for which to fetch
        assay data.
        base_url (str, optional): The base URL for PubChem API requests.
        Default is set to the assay target genesymbol endpoint.
        sep (str, optional): The delimiter to use for parsing CSV files
        returned by PubChem. Defaults to ','.

    Attributes:
        _base_url (str): Stores the base URL for API requests.
        _sep (str): Stores the delimiter for parsing CSV data.
        enzyme_list (list of str): Stores the list of enzyme names provided
        during initialization.
        _enzyme_count (int): The number of enzymes in the enzyme_list.
    """
    self._base_url = base_url
    self._sep = sep
    self.enzyme_list = enzyme_list
    self._enzyme_count = len(enzyme_list)

extract_assay_properties(main_data)

Extracts detailed properties of assays from PubChem for each unique assay ID found in the input data file.

This method processes an input CSV file containing assay IDs (AID) and performs concurrent HTTP requests to fetch detailed assay properties from the PubChem database. The retrieved details include assay type, activity name, source name, source ID, name, and description. These properties are compiled into a new DataFrame, which is then saved to a CSV file for further analysis or use.

The method employs a ThreadPoolExecutor to manage concurrent requests efficiently, improving the performance when dealing with a large number of assay IDs. Errors encountered during data fetching are logged, and the process continues with the next assay ID, ensuring the method's robustness.

Parameters:

    main_data (str): Path to a CSV file containing the main data, which was generated after running `extractor.run()`. Required.

Returns:

    pd.DataFrame: A DataFrame containing the fetched assay properties, including columns for AID, Assay Type, Activity Name, SourceName, SourceID, Name, and Description. This DataFrame is saved to 'Data/Nodes/Assay_Properties.csv' in the current working directory.

Raises:

    ValueError: If the input CSV file is empty or does not contain the 'AID' column.

Example

    >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
    >>> extractor.create_data_directories()
    >>> extractor.run()
    >>> assay_properties_df = extractor.extract_assay_properties('Data/AllDataConnected.csv')
    >>> print(assay_properties_df.head())

This example reads assay IDs from 'Data/AllDataConnected.csv', queries PubChem for their detailed properties, and compiles the results into a DataFrame, which is also saved to 'Data/Nodes/Assay_Properties.csv'.

Note

This method requires network access to the PubChem API and assumes the availability of a valid 'AID' column in the input CSV file. Ensure the input file path is correct and accessible to avoid errors during processing.

Source code in chemgraphbuilder/node_properties_extractor.py
def extract_assay_properties(self, main_data):
    """
    Extracts detailed properties of assays from PubChem for each unique assay
    ID found in the input data file.

    This method processes an input CSV file containing assay IDs (AID) and
    performs concurrent HTTP requests to fetch detailed assay properties
    from the PubChem database. The retrieved details include assay type,
    activity name, source name, source ID, name, and description. These
    properties are compiled into a new DataFrame, which is then
    saved to a CSV file for further analysis or use.

    The method employs a ThreadPoolExecutor to manage concurrent requests
    efficiently, improving the performance when dealing with a large number
    of assay IDs. Errors encountered during data fetching are logged, and the
    process continues with the next assay ID, ensuring the method's robustness.

    Parameters:
        main_data (str): Path to a CSV file containing the main data, which
        was generated after running `extractor.run()`.

    Returns:
        pd.DataFrame: A DataFrame containing the fetched assay properties,
        including columns for AID, Assay Type, Activity Name, SourceName,
        SourceID, Name, and Description. This DataFrame is saved to
        'Data/Nodes/Assay_Properties.csv' in the current working directory.

    Raises:
        ValueError: If the input CSV file is empty or does not contain the 'AID' column.

    Example:
        >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
        >>> extractor.create_data_directories()
        >>> extractor.run()
        >>> assay_properties_df = extractor.extract_assay_properties('Data/AllDataConnected.csv')
        >>> print(assay_properties_df.head())

        This example reads assay IDs from 'Data/AllDataConnected.csv',
        queries PubChem for their detailed properties, and compiles the
        results into a DataFrame, which is also saved to 'Data/Nodes/Assay_Properties.csv'.

    Note:
        This method requires network access to the PubChem API and assumes
        the availability of a valid 'AID' column in the input CSV file.
        Ensure the input file path is correct and accessible to avoid errors during processing.
    """

    df = pd.read_csv(main_data)

    # Check if the DataFrame is valid
    if df.empty or 'AID' not in df.columns:
        logging.error("DataFrame is empty or does not contain 'AID' column.")
        return pd.DataFrame()

    unique_aids = df['AID'].unique().tolist()  # Extracting unique assay IDs

    columns = ['AID', 'Assay Type', 'Activity Name', 'SourceName',
               'SourceID', 'Name', 'Description', 'Protocol',
               'Comment', 'Method', 'Target', 'CIDCountAll',
               'CIDCountActive', 'CIDCountInactive', 'CIDCountInconclusive',
               'CIDCountUnspecified', 'CIDCountProbe']
    assay_df = pd.DataFrame(columns=columns)  # Initializing a DataFrame to store assay properties

    # Using ThreadPoolExecutor for concurrent fetching of assay details
    with concurrent.futures.ThreadPoolExecutor(max_workers=self._CONCURRENT_REQUEST_LIMIT) as executor:
        future_to_aid = {executor.submit(self._fetch_assay_details, aid): aid for aid in unique_aids}

        # Iterating over completed futures
        for future in concurrent.futures.as_completed(future_to_aid):
            aid = future_to_aid[future]
            try:
                assay_data = future.result()  # Fetching the result from the future
                if assay_data:
                    # Preparing a new row with the fetched data
                    new_row = {
                        'AID': aid,
                        'Assay Type': df.loc[df['AID'] == aid, 'Assay Type'].iloc[0],
                        'Activity Name': df.loc[df['AID'] == aid, 'Activity Name'].iloc[0],
                        **assay_data
                    }
                    # Adding the new row to the DataFrame
                    assay_df = pd.concat([assay_df, pd.DataFrame([new_row])], ignore_index=True)
            except Exception as exc:
                # Logging any errors encountered during the fetch
                logging.error(f"Error occurred while processing AID {aid}: {exc}")

    # Saving the updated DataFrame to a CSV file
    assay_df.to_csv('Data/Nodes/Assay_Properties.csv', sep=',', index=False)
    return assay_df
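
The fan-out/collect idiom used above generalizes beyond assays. Here is a standalone sketch of the same pattern, with a placeholder fetch function standing in for self._fetch_assay_details (the function and ID list are illustrative, not part of the library):

import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch(item_id):
    # Placeholder for a network call that returns a dict of properties.
    return {'id': item_id}

ids = [1, 2, 3]
results = []
with ThreadPoolExecutor(max_workers=5) as executor:
    future_to_id = {executor.submit(fetch, i): i for i in ids}
    for future in as_completed(future_to_id):
        item_id = future_to_id[future]
        try:
            results.append(future.result())
        except Exception as exc:  # one failed ID does not abort the whole batch
            logging.error("Error occurred while processing ID %s: %s", item_id, exc)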

extract_compound_properties(main_data, start_chunk=0)

Extracts and aggregates compound properties from PubChem for a list of compounds associated with specific genes.

This method processes a CSV file specified by main_data, which contains gene identifiers and their associated compound IDs (CIDs). It selects compounds related to the top n most frequently occurring genes in the dataset, where n is determined by the instance's _enzyme_count attribute. The method then fetches detailed compound properties from PubChem in chunks, using concurrent requests to improve efficiency and manage the load on the PubChem API. The fetched compound properties are aggregated into a single DataFrame and saved to multiple CSV files, one for each chunk of compound IDs processed.

Parameters:

    main_data (str): Path to a CSV file containing the main data, which was generated after running `extractor.run()`. Required.
    start_chunk (int, optional): Index of the first chunk to process, which allows an interrupted run to be resumed. Defaults to 0.

Side Effects
  • Saves the aggregated compound properties to CSV files in the current working directory. The files are named 'Data/Nodes/Compound_Properties/Chunk_{i}.csv', where {i} is the chunk index.

Returns:

    None: This method does not return a value. Instead, it saves the fetched compound data directly to CSV files.

Raises:

    Exception: Logs an error and continues processing the next CID if an error occurs while fetching data for a specific CID.

Example

    >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
    >>> extractor.create_data_directories()
    >>> extractor.extract_compound_properties('Data/AllDataConnected.csv')

This will read 'Data/AllDataConnected.csv', filter for compounds associated with the top n genes, fetch their properties from PubChem, and save the results into multiple CSV files, one for each chunk of compounds processed.

Note
  • Ensure that the 'main_data' CSV file exists and is accessible at the specified path.
  • The method automatically handles NaN values in the 'CID' column and excludes them from processing.
  • The enzyme_count attribute determines the number of top genes for which compound properties will be fetched.
  • Internet access is required to fetch compound data from the PubChem API.
  • The method employs a ThreadPoolExecutor to parallelize requests (the implementation shown uses five workers); adjust the worker count based on system capabilities and API rate limits.
Source code in chemgraphbuilder/node_properties_extractor.py
def extract_compound_properties(self, main_data, start_chunk=0):
    """
    Extracts and aggregates compound properties from PubChem for a list of
    compounds associated with specific genes.

    This method processes a CSV file specified by `main_data`, which contains
    gene identifiers and their associated compound IDs (CIDs). It selects
    compounds related to the top `n` most frequently occurring genes in the
    dataset, where `n` is determined by the instance's `_enzyme_count`
    attribute. The method then fetches detailed compound properties from
    PubChem in chunks, using concurrent requests to improve efficiency and
    manage the load on the PubChem API. The fetched compound properties are
    aggregated into a single DataFrame and saved to multiple CSV files,
    one for each chunk of compound IDs processed.

    Parameters:
        main_data (str): Path to a CSV file containing the main data, which
        was generated after running `extractor.run()`.

    Side Effects:
        - Saves the aggregated compound properties to CSV files in the current
        working directory. The files are named
        'Data/Nodes/Compound_Properties/Chunk_{i}.csv', where `{i}` is
        the chunk index.

    Returns:
        None: This method does not return a value. Instead, it saves the
        fetched compound data directly to CSV files.

    Raises:
        Exception: Logs an error and continues processing the next CID if
        an error occurs while fetching data for a specific CID.

    Example:
        >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
        >>> extractor.create_data_directories()
        >>> extractor.extract_compound_properties('Data/AllDataConnected.csv')
        This will read 'Data/AllDataConnected.csv', filter for compounds
        associated with the top n genes, fetch their properties from PubChem,
        and save the results into multiple CSV files for each chunk
        of compounds processed.

    Note:
        - Ensure that the 'main_data' CSV file exists and is accessible at
        the specified path.
        - The method automatically handles NaN values in the 'CID' column
        and excludes them from processing.
        - The `enzyme_count` attribute determines the number of top genes
        for which compound properties will be fetched.
        - Internet access is required to fetch compound data from the PubChem API.
        - The method employs a `ThreadPoolExecutor` to parallelize requests
        (the implementation shown uses five workers); adjust the worker
        count based on system capabilities and API rate limits.
    """

    n = self._enzyme_count
    df = pd.read_csv(main_data)
    gene_ids = df['Target GeneID'].value_counts().head(n).index.tolist()
    df = df[df['Target GeneID'].isin([int(item) for item in gene_ids])]
    df = df.dropna(subset=['CID'])
    IDs = df['CID'].unique().tolist()

    # Define chunk size and calculate number of chunks
    chunk_size = 10000
    num_chunks = math.ceil(len(IDs) / chunk_size)

    if num_chunks >= start_chunk:
        for i in range(start_chunk, num_chunks):
            # Calculate start and end indices for each chunk
            start_index = i * chunk_size
            end_index = start_index + chunk_size

            # Extract chunk of CIDs
            chunk_cids = IDs[start_index:end_index]
            # chunk_cids = [x for x in chunk_cids if not np.isnan(x)]

            # Use ThreadPoolExecutor to parallelize requests for the chunk
            with ThreadPoolExecutor(max_workers=5) as executor:
                future_to_cid = {executor.submit(self.fetch_data, cid): cid for cid in chunk_cids}
                results = []

                for future in as_completed(future_to_cid):
                    cid = future_to_cid[future]
                    try:
                        data = future.result()
                        results.append(data)
                    except Exception as e:
                        logging.error(f"Error processing CID {cid}: {e}")

            # Concatenate results for the current chunk
            chunk_df = pd.concat(results, ignore_index=True)

            # Save the concatenated DataFrame to a CSV file for the chunk
            chunk_df.to_csv(f'Data/Nodes/Compound_Properties/Chunk_{i}.csv',
                            sep=',', index=False)
    else:
        logging.info("No more chunks to process.")

extract_gene_properties(main_data)

Extracts and processes gene properties from a given data source, specifically targeting genes relevant to the study (e.g., CYP enzymes) and records their details in a structured DataFrame.

This method reads gene data from a CSV file specified by main_data, queries the PubChem database for additional properties of each unique gene ID found in the file, and compiles these properties into a new DataFrame. It focuses on fetching details like gene symbols, taxonomy, taxonomy IDs, and synonyms for each gene. The final DataFrame is filtered to include only genes of particular interest (e.g., certain CYP enzymes) and saved to a separate CSV file for further analysis or use.

Parameters:

    main_data (str): Path to a CSV file containing the main data, which was generated after running `extractor.run()`. Required.

Returns:

    pd.DataFrame: A DataFrame containing the compiled gene properties, including GeneID, Symbol, Taxonomy, Taxonomy ID, and Synonyms, filtered to include only specified genes of interest. This DataFrame is also saved to 'Data/Nodes/Gene_Properties.csv'.

Raises:

    Exception: If there's an issue reading the initial CSV file or fetching gene details from PubChem, details of the exception are logged, and the method proceeds to process the next gene ID.

Example

    >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
    >>> extractor.create_data_directories()
    >>> extractor.run()
    >>> gene_properties_df = extractor.extract_gene_properties('Data/AllDataConnected.csv')
    >>> print(gene_properties_df.head())

This would read gene IDs from 'Data/AllDataConnected.csv', fetch their properties from PubChem, and compile the details into a DataFrame, filtering for specified genes of interest and saving the results to 'Data/Nodes/Gene_Properties.csv'.

Note

The method filters the resulting DataFrame to include only genes with symbols in the predefined enzyme_list. Adjust this list as necessary to match the focus of your study or application.

Source code in chemgraphbuilder/node_properties_extractor.py
def extract_gene_properties(self, main_data):
    """
    Extracts and processes gene properties from a given data source,
    specifically targeting genes relevant to the study (e.g., CYP enzymes)
    and records their details in a structured DataFrame.

    This method reads gene data from a CSV file specified by `main_data`,
    queries the PubChem database for additional properties of each unique
    gene ID found in the file, and compiles these properties into a new
    DataFrame. It focuses on fetching details like gene symbols, taxonomy,
    taxonomy IDs, and synonyms for each gene. The final DataFrame is filtered
    to include only genes of particular interest (e.g., certain CYP enzymes)
    and saved to a separate CSV file for further analysis or use.

    Parameters:
        main_data (str): Path to a CSV file containing the main data, which
        was generated after running `extractor.run()`.

    Returns:
        pd.DataFrame: A DataFrame containing the compiled gene properties,
        including GeneID, Symbol, Taxonomy, Taxonomy ID, and Synonyms,
        filtered to include only specified genes of interest. This DataFrame
        is also saved to 'Data/Nodes/Gene_Properties.csv'.

    Raises:
        Exception: If there's an issue reading the initial CSV file or
        fetching gene details from PubChem, details of the exception are
        logged, and the method proceeds to process the next gene ID.

    Example:
        >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
        >>> extractor.create_data_directories()
        >>> extractor.run()
        >>> gene_properties_df = extractor.extract_gene_properties('Data/AllDataConnected.csv')
        >>> print(gene_properties_df.head())

        This would read gene IDs from 'Data/AllDataConnected.csv', fetch
        their properties from PubChem, and compile the details into a
        DataFrame, filtering for specified genes of interest and saving
        the results to 'Data/Nodes/Gene_Properties.csv'.

    Note:
        The method filters the resulting DataFrame to include only genes with
        symbols in the predefined enzyme_list. Adjust this list as necessary
        to match the focus of your study or application.
    """
    df = pd.read_csv(main_data)
    df_gene = pd.DataFrame(columns=['GeneID', 'Symbol', 'Taxonomy',
                                    'Taxonomy ID', 'Synonyms'])

    unique_gene_ids = df['Target GeneID'].unique().tolist()

    gene_details = []

    for gene_id in unique_gene_ids:
        try:
            gene_id, symbol, taxonomy, taxonomy_id, synonyms = self._fetch_gene_details(gene_id)
            gene_details.append({
                'GeneID': gene_id,
                'Symbol': symbol,
                'Taxonomy': taxonomy,
                'Taxonomy ID': taxonomy_id,
                'Synonyms': str(synonyms)
            })
        except Exception as exc:
            logging.error("Error occurred while processing gene_id %s: %s",
                          gene_id, exc)
            gene_details.append({
                'GeneID': gene_id,
                'Symbol': None,
                'Taxonomy': None,
                'Taxonomy ID': None,
                'Synonyms': None
            })

    # Now create the DataFrame from the list of dictionaries
    df_gene = pd.DataFrame(gene_details)
    n = self._enzyme_count
    gene_ids = df['Target GeneID'].value_counts().head(n).index.tolist()
    df_gene = df_gene[df_gene['GeneID'].isin([int(item) for item in gene_ids])]
    df_gene.to_csv('Data/Nodes/Gene_Properties.csv', sep=',', index=False)
    return df_gene

extract_protein_properties(main_data)

Extracts and compiles protein properties from the NCBI protein database based on accession numbers.

Given a CSV file specified by main_data, this method reads protein accession numbers and performs web scraping on the NCBI protein database pages to extract protein titles. The method constructs a URL for each accession number, sends a request to retrieve the page content, and parses the HTML to find the protein title. The extracted titles, along with their corresponding accession numbers and URLs, are compiled into a DataFrame. This DataFrame is saved to a CSV file, providing a structured summary of protein properties for further analysis or use.

Parameters:

    main_data (str): Path to a CSV file containing the main data, which was generated after running `extractor.run()`. Required.

Returns:

    pd.DataFrame: A DataFrame with columns 'RefSeq Accession', 'URL', and 'Description', where 'Description' contains the title of the protein extracted from its NCBI page. This DataFrame is saved to 'Data/Nodes/Protein_Properties.csv' in the current working directory.

Raises:

    Exception: If there's an issue reading the initial CSV file or querying the NCBI database, details of the exception are logged. The method continues processing the next accession number, ensuring robustness against individual failures.

Example

    Assuming 'protein_data.csv' contains a column 'Target Accession' with accession numbers:

    >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
    >>> extractor.create_data_directories()
    >>> extractor.run()  # you need to run this only once
    >>> protein_properties_df = extractor.extract_protein_properties('Data/AllDataConnected.csv')
    >>> print(protein_properties_df.head())

This would read accession numbers from 'Data/AllDataConnected.csv', scrape their titles from the NCBI protein database, and compile the results into a DataFrame, which is also saved to 'Data/Nodes/Protein_Properties.csv'.

Note

This method requires internet access to query the NCBI protein database. Ensure the input file path is correct and accessible to avoid errors during processing. Web scraping is dependent on the structure of the web page; changes to the NCBI protein database pages may require updates to the scraping logic.

Source code in chemgraphbuilder/node_properties_extractor.py
def extract_protein_properties(self, main_data):
    """
    Extracts and compiles protein properties from the NCBI protein database
    based on accession numbers.

    Given a CSV file specified by `main_data`, this method reads protein
    accession numbers and performs web scraping on the NCBI protein database
    pages to extract protein titles. The method constructs a URL for
    each accession number, sends a request to retrieve the page content,
    and parses the HTML to find the protein title. The extracted titles,
    along with their corresponding accession numbers and URLs, are
    compiled into a DataFrame. This DataFrame is saved to a CSV file,
    providing a structured summary of protein properties for further analysis or use.

    Parameters:
        main_data (str): Path to a CSV file containing the main data, which
        was generated after running `extractor.run()`.

    Returns:
        pd.DataFrame: A DataFrame with columns 'RefSeq Accession', 'URL',
        and 'Description', where 'Description' contains the title of the
        protein extracted from its NCBI page. This DataFrame is saved to
        'Data/Nodes/Protein_Properties.csv' in the current working directory.

    Raises:
        Exception: If there's an issue reading the initial CSV file or
        querying the NCBI database, details of the exception are logged.
        The method continues processing the next accession number,
        ensuring robustness against individual failures.

    Example:
        Assuming 'protein_data.csv' contains a column 'Target Accession'
        with accession numbers:

        >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
        >>> extractor.create_data_directories()
        >>> extractor.run() # you need to run this only once
        >>> protein_properties_df = extractor.extract_protein_properties('Data/AllDataConnected.csv')
        >>> print(protein_properties_df.head())

        This would read accession numbers from 'Data/AllDataConnected.csv',
        scrape their titles from the NCBI protein database, and compile the
        results into a DataFrame, which is also saved to
        'Data/Nodes/Protein_Properties.csv'.

    Note:
        This method requires internet access to query the NCBI protein
        database. Ensure the input file path is correct and accessible to
        avoid errors during processing. Web scraping is dependent on the
        structure of the web page; changes to the NCBI protein database
        pages may require updates to the scraping logic.
    """

    # Initialize a list to store the extracted data
    data = []

    n = self._enzyme_count
    df = pd.read_csv(main_data)
    gene_ids = df['Target GeneID'].value_counts().head(n).index.tolist()
    df = df[df['Target GeneID'].isin([int(item) for item in gene_ids])]
    Accessions = df['Target Accession'].unique().tolist()
    # Iterate over each protein accession number in the DataFrame
    for accession in Accessions:
        # Construct the URL to query the NCBI protein database
        url = f"https://www.ncbi.nlm.nih.gov/protein/{accession}"

        try:
            # Send an HTTP request to the URL
            response = requests.get(url)

            # Parse the HTML content of the response
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the title from the parsed HTML
            title = soup.title.string if soup.title else 'Title Not Found'

            # Append the extracted data to the list
            data.append({'RefSeq Accession': accession,
                         'URL': url, 'Description': title})
        except Exception as e:
            # In case of an error, log the error message
            logging.error(f"Error fetching data for accession {accession}: {e}")
            data.append({'RefSeq Accession': accession, 'URL': url,
                         'Description': f'Error: {e}'})

    # Convert the list of data into a DataFrame
    protein_df = pd.DataFrame(data)

    # Save the DataFrame to a CSV file
    protein_df.to_csv('Data/Nodes/Protein_Properties.csv',
                      sep=',', index=False)

    # Return the DataFrame
    return protein_df
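
Because the scraping step depends on the structure of NCBI pages, a slightly more defensive variant of the per-accession request can be useful; the timeout and status check below are suggestions, not part of the library:

import requests
from bs4 import BeautifulSoup

def fetch_protein_title(accession, timeout=10):
    """Return the NCBI page title for one accession, or None on failure."""
    url = f"https://www.ncbi.nlm.nih.gov/protein/{accession}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # surface HTTP errors instead of parsing an error page
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.title.string if soup.title else None
    except requests.RequestException:
        return None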

fetch_data(cid)

Retrieves detailed chemical compound properties for a specified Compound ID (CID) from the PubChem database.

This method constructs a query URL to fetch a wide range of properties for the given CID from PubChem, including molecular formula, molecular weight, canonical and isomeric SMILES, InChI codes, physicochemical properties, and more. If the CID is valid and data is available, it returns a pandas DataFrame containing these properties. This method also generates a URL to retrieve the structure image of the compound as a 2D PNG image, adding it as a column in the DataFrame. In cases where the CID is NaN or an error occurs during data retrieval, an empty DataFrame is returned.

Parameters:

    cid (int or float): The Compound ID for which to fetch data. Can be an integer or NaN. Required.

Returns:

    pd.DataFrame: A DataFrame containing the fetched properties for the given CID. The DataFrame includes columns for each property fetched from PubChem, along with a 'StructureImage2DURL' column containing the URL to the compound's structure image. Returns an empty DataFrame if the CID is NaN or if any error occurs during the fetch operation.

Raises:

    Exception: Logs an error message if the request to PubChem fails or if the response cannot be processed into a DataFrame.

Example

    >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
    >>> extractor.create_data_directories()
    >>> compound_data_df = extractor.fetch_data(2244)
    >>> print(compound_data_df.head())

This example fetches the properties for the compound with CID 2244 from PubChem and prints the first few rows of the resulting DataFrame.

Note

This method requires an active internet connection to access the PubChem database. Ensure that the CID provided is valid and not NaN to avoid fetching errors. The structure and availability of data fields are subject to the current state of the PubChem database and may vary.

Source code in chemgraphbuilder/node_properties_extractor.py
def fetch_data(self, cid):
    """
    Retrieves detailed chemical compound properties for a specified
    Compound ID (CID) from the PubChem database.

    This method constructs a query URL to fetch a wide range of properties
    for the given CID from PubChem, including molecular formula,
    molecular weight, canonical and isomeric SMILES, InChI codes,
    physicochemical properties, and more. If the CID is valid and data is
    available, it returns a pandas DataFrame containing these properties. This
    method also generates a URL to retrieve the structure image of the
    compound as a 2D PNG image, adding it as a column in the DataFrame.
    In cases where the CID is NaN or an error occurs during data retrieval,
    an empty DataFrame is returned.

    Parameters:
        cid (int or float): The Compound ID for which to fetch data.
        Can be an integer or NaN.

    Returns:
        pd.DataFrame: A DataFrame containing the fetched properties for the
        given CID. The DataFrame includes columns for each property fetched
        from PubChem, along with a 'StructureImage2DURL' column containing
        the URL to the compound's structure image. Returns an empty DataFrame
        if the CID is NaN or if any error occurs during the fetch operation.

    Raises:
        Exception: Logs an error message if the request to PubChem fails or
        if the response cannot be processed into a DataFrame.

    Example:
        >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
        >>> extractor.create_data_directories()
        >>> compound_data_df = extractor.fetch_data(2244)
        >>> print(compound_data_df.head())

        This example fetches the properties for the compound with CID 2244
        from PubChem and prints the first few rows
        of the resulting DataFrame.

    Note:
        This method requires an active internet connection to access the
        PubChem database. Ensure that the CID provided is valid and not NaN
        to avoid fetching errors. The structure and availability of data
        fields are subject to the current state of the PubChem database
        and may vary.
    """
    if pd.isna(cid):
        return pd.DataFrame()  # Return an empty DataFrame for NaN CIDs

    cid = int(cid)  # Convert CID to integer
    url = (f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/"
           "MolecularFormula,MolecularWeight,CanonicalSMILES,IsomericSMILES,InChI,"
           "InChIKey,IUPACName,Title,XLogP,ExactMass,MonoisotopicMass,TPSA,Complexity,"
           "Charge,HBondDonorCount,HBondAcceptorCount,RotatableBondCount,HeavyAtomCount,"
           "IsotopeAtomCount,AtomStereoCount,DefinedAtomStereoCount,UndefinedAtomStereoCount,"
           "BondStereoCount,DefinedBondStereoCount,UndefinedBondStereoCount,CovalentUnitCount,"
           "PatentCount,PatentFamilyCount,LiteratureCount,Volume3D,XStericQuadrupole3D,"
           "YStericQuadrupole3D,ZStericQuadrupole3D,FeatureCount3D,FeatureAcceptorCount3D,"
           "FeatureDonorCount3D,FeatureAnionCount3D,FeatureCationCount3D,FeatureRingCount3D,"
           "FeatureHydrophobeCount3D,ConformerModelRMSD3D,EffectiveRotorCount3D,ConformerCount3D,"
           "Fingerprint2D/CSV")
    try:
        response = requests.get(url)
        response.raise_for_status()
        compound_data = pd.read_csv(StringIO(response.text),
                                    sep=',', low_memory=False)
        compound_data['StructureImage2DURL'] = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/PNG"
        return compound_data
    except Exception as e:
        logging.error(f"Error processing CID {cid}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error
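
fetch_data returns one small DataFrame per CID (or an empty one on failure), so results for several CIDs concatenate directly. A brief sketch, assuming an extractor instance created as in the examples above (the CIDs are illustrative, and the column names follow PubChem's property CSV):

import pandas as pd

cids = [2244, 2519]  # illustrative compound IDs
frames = [extractor.fetch_data(cid) for cid in cids]
compounds = pd.concat(frames, ignore_index=True)
print(compounds[['CID', 'MolecularFormula', 'MolecularWeight']])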

get_enzyme_assays(enzyme)

Fetches assay data for a specified enzyme from the PubChem database and returns it as a pandas DataFrame.

This method constructs a URL to query the PubChem database for concise assay data related to the given enzyme. It processes the CSV response into a DataFrame, which includes various assay data points provided by PubChem.

Parameters:

    enzyme (str): The name of the enzyme for which assay data is requested. This name is used in the API query. Required.

Returns:

    pd.DataFrame: A DataFrame containing the assay data fetched from PubChem for the specified enzyme. The DataFrame includes columns based on the CSV response from PubChem, such as assay ID, results, and conditions. Returns None if no data is available or if an error occurs during data fetching or processing.

Raises:

    requests.RequestException: If an error occurs during the HTTP request to the PubChem API.
    pd.errors.EmptyDataError: If the response from PubChem contains no data.

Example

    >>> extractor = NodePropertiesExtractor(['enzyme'])
    >>> enzyme_assays_df = extractor.get_enzyme_assays('enzyme')
    >>> print(enzyme_assays_df.head())

Source code in chemgraphbuilder/node_properties_extractor.py
def get_enzyme_assays(self, enzyme):
    """
    Fetches assay data for a specified enzyme from the PubChem database and
    returns it as a pandas DataFrame.

    This method constructs a URL to query the PubChem database for concise
    assay data related to the given enzyme. It processes the CSV response
    into a DataFrame, which includes various assay data points provided by PubChem.

    Parameters:
        enzyme (str): The name of the enzyme for which assay data is
        requested. This name is used in the API query.

    Returns:
        pd.DataFrame: A DataFrame containing the assay data fetched from
        PubChem for the specified enzyme. The DataFrame includes columns
        based on the CSV response from PubChem, such as assay ID, results,
        and conditions. Returns None if no data is available or if an error
        occurs during data fetching or processing.

    Raises:
        requests.RequestException: If an error occurs during the HTTP
        request to the PubChem API.
        pd.errors.EmptyDataError: If the response from PubChem contains no data.

    Example:
        >>> extractor = NodePropertiesExtractor(['enzyme'])
        >>> enzyme_assays_df = extractor.get_enzyme_assays('enzyme')
        >>> print(enzyme_assays_df.head())
    """
    assays_url = f"{self._base_url}/{enzyme.lower()}/concise/CSV"
    logging.info(f"Fetching assays for enzyme: {enzyme}")

    response = self._make_request(assays_url)

    assays_csv_string = response.text
    assays_csv_string_io = StringIO(assays_csv_string)
    try:
        assays_df = pd.read_csv(assays_csv_string_io,
                                sep=self._sep,
                                low_memory=False)
        logging.info("Assays DataFrame for enzyme %s has shape: %s",
                     enzyme, assays_df.shape)
        return assays_df
    except pd.errors.EmptyDataError:
        logging.warning("No data available for enzyme %s.", enzyme)
        return None

run()

Orchestrates the process of fetching, filtering, and aggregating assay data from PubChem for a predefined list of enzymes.

This method iteratively queries PubChem for assay data corresponding to each enzyme specified in the enzyme_list attribute during class initialization. It performs the following steps for each enzyme:

1. Constructs a query URL and fetches assay data from PubChem.
2. Filters the fetched data based on predefined criteria (e.g., containing specific substrings in the assay name).
3. Aggregates the filtered data into a single pandas DataFrame.
4. Identifies enzymes for which data could not be fetched or were excluded based on filtering criteria, logging their names.

The final aggregated DataFrame, containing assay data for all successfully processed enzymes, is then saved to a CSV file. This method facilitates the extraction and preprocessing of chemical assay data for further analysis or integration into knowledge graphs.

Note
  • This method relies on the successful response from PubChem for each enzyme query.
  • Enzymes with no available data or failing to meet the filtering criteria are excluded from the final DataFrame.
  • The output CSV file is saved in the current working directory with the name 'Data/AllDataConnected.csv'.

Returns:

    pd.DataFrame: A DataFrame containing the aggregated and filtered assay data for the specified enzymes. Columns in the DataFrame correspond to the assay data fields returned by PubChem, subject to the filtering criteria applied within this method.

Raises:

    requests.RequestException: If there is an issue with fetching data from PubChem, such as a network problem or an invalid response.

Example

    Assuming enzyme_list was set to ['CYP2D6', 'CYP3A4'] during class initialization:

    >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
    >>> extractor.create_data_directories()
    >>> result_df = extractor.run()
    >>> print(result_df.head())

This will fetch and process assay data for 'CYP2D6' and 'CYP3A4', returning a DataFrame with the processed data.

Source code in chemgraphbuilder/node_properties_extractor.py
def run(self):
    """
    Orchestrates the process of fetching, filtering, and aggregating assay
    data from PubChem for a predefined list of enzymes.

    This method iteratively queries PubChem for assay data corresponding
    to each enzyme specified in the `enzyme_list` attribute during class
    initialization. It performs the following steps for each enzyme:
    1. Constructs a query URL and fetches assay data from PubChem.
    2. Filters the fetched data based on predefined criteria
    (e.g., containing specific substrings in the assay name).
    3. Aggregates the filtered data into a single pandas DataFrame.
    4. Identifies enzymes for which data could not be fetched or were
    excluded based on filtering criteria, logging their names.

    The final aggregated DataFrame, containing assay data for all successfully
    processed enzymes, is then saved to a CSV file. This method facilitates
    the extraction and preprocessing of chemical assay data for further
    analysis or integration into knowledge graphs.

    Note:
        - This method relies on the successful response from PubChem
        for each enzyme query.
        - Enzymes with no available data or failing to meet the filtering
        criteria are excluded from the final DataFrame.
        - The output CSV file is saved in the current working directory
        with the name 'Data/AllDataConnected.csv'.

    Returns:
        pd.DataFrame: A DataFrame containing the aggregated and filtered
        assay data for the specified enzymes. Columns in the DataFrame
        correspond to the assay data fields returned by PubChem, subject to
        the filtering criteria applied within this method.

    Raises:
        requests.RequestException: If there is an issue with fetching data
        from PubChem, such as a network problem or an invalid response.

    Example:
        Assuming `enzyme_list` was set to ['CYP2D6', 'CYP3A4'] during
        class initialization:

        >>> extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
        >>> extractor.create_data_directories()
        >>> result_df = extractor.run()
        >>> print(result_df.head())

        This will fetch and process assay data for 'CYP2D6' and 'CYP3A4',
        returning a DataFrame with the processed data.
    """

    # Initialize an empty list to store enzymes with successful responses
    enzymes_with_response = []

    # Keep a copy of the original list to identify removed enzymes later
    original_enzyme_list = self.enzyme_list.copy()

    for enzyme in self.enzyme_list:
        # Formulate the URL
        url = f"{self._base_url}/{enzyme}/concise/CSV"

        try:
            response = requests.get(url)
            # Check for a successful response (status code 200)
            if response.status_code == 200:
                enzymes_with_response.append(enzyme)  # Keep the enzyme in the new list
        except requests.RequestException:
            # If there's an exception, skip adding the enzyme to the new list
            pass

    # Update the enzyme list with only the enzymes that had a successful response
    self.enzyme_list = enzymes_with_response

    # Identify and print the removed enzymes
    removed_enzymes = [enzyme for enzyme in original_enzyme_list if enzyme not in enzymes_with_response]
    if removed_enzymes:
        logging.info("These enzymes were removed because their names aren't correct: %s",
                     ", ".join(removed_enzymes))

    df_list = self._process_enzymes(self.enzyme_list)
    df = self._concatenate_data(df_list)
    substrings_to_filter = ['CYP', 'Cytochrome']
    pattern = '|'.join(substrings_to_filter)
    df = df[df['Assay Name'].str.contains(pattern, case=False, na=False)]
    df.to_csv('Data/AllDataConnected.csv', index=False)
    return df
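
Taken together, the methods above form a linear pipeline: run() produces 'Data/AllDataConnected.csv', and each extract_* method derives one node table from it. A sketch of the full sequence (assuming the data directories have been created first):

from chemgraphbuilder.node_properties_extractor import NodePropertiesExtractor

extractor = NodePropertiesExtractor(['CYP2D6', 'CYP3A4'])
extractor.create_data_directories()

main_df = extractor.run()  # writes Data/AllDataConnected.csv
extractor.extract_assay_properties('Data/AllDataConnected.csv')
extractor.extract_gene_properties('Data/AllDataConnected.csv')
extractor.extract_protein_properties('Data/AllDataConnected.csv')
extractor.extract_compound_properties('Data/AllDataConnected.csv')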

4. Node Data Processor

node_data_processor.py

This module provides the NodeDataProcessor class, which is responsible for preprocessing various types of node data (assays, proteins, genes, and compounds) for use in chemical knowledge graph construction. The preprocessing includes renaming columns, consolidating multiple files, and saving the processed data in a consistent format. This step ensures uniformity and ease of access for subsequent data analysis and integration processes.

Classes:

Name Description
NodeDataProcessor

Handles preprocessing of assay, protein, gene, and compound data.

Example Usage

>>> processor = NodeDataProcessor(data_dir='path/to/data')
>>> processor.preprocess_assays()
>>> processor.preprocess_proteins()
>>> processor.preprocess_genes()
>>> processor.preprocess_compounds()

NodeDataProcessor

NodeDataProcessor is responsible for preprocessing various types of node data (assays, proteins, genes, and compounds) by renaming columns, consolidating multiple files, and saving the processed data. This preprocessing step is crucial for ensuring uniformity and ease of access in subsequent analysis and integration processes.

Attributes:

    data_dir (str): The directory where the node data files are stored.

Methods:

    preprocess_assays: Processes and renames columns in assay data.
    preprocess_proteins: Processes and renames columns in protein data.
    preprocess_genes: Processes and renames columns in gene data.
    preprocess_compounds: Consolidates and renames columns in compound data.

Source code in chemgraphbuilder/node_data_processor.py
class NodeDataProcessor:
    """
    NodeDataProcessor is responsible for preprocessing various types of node data
    (assays, proteins, genes, and compounds) by renaming columns, consolidating
    multiple files, and saving the processed data. This preprocessing step is
    crucial for ensuring uniformity and ease of access in subsequent analysis
    and integration processes.

    Attributes:
        data_dir (str): The directory where the node data files are stored.

    Methods:
        preprocess_assays(): Processes and renames columns in assay data.
        preprocess_proteins(): Processes and renames columns in protein data.
        preprocess_genes(): Processes and renames columns in gene data.
        preprocess_compounds(): Consolidates and renames columns in compound data.
    """

    def __init__(self, data_dir: str):
        """
        Initializes the NodeDataProcessor with a directory path to manage the data files.

        Args:
            data_dir (str): The directory where the node data files are stored.
        """
        self.data_dir = data_dir


    def preprocess_assays(self):
        """
        Processes the assay data by renaming columns and saving the modified data back to disk.
        This method also handles visualization of assay data distributions if necessary.
        """
        df = pd.read_csv(f'{self.data_dir}/Nodes/Assay_Properties.csv')
        df.rename(columns={"AID": "AssayID", "Assay Type": "AssayType",
                           "Activity Name": "AssayActivityName", "SourceID": "AssaySourceID",
                           "SourceName": "AssaySourceName", "Name": "AssayName",
                           "Description": "AssayDescription"}, inplace=True)
        df.to_csv(f'{self.data_dir}/Nodes/Assay_Properties_Processed.csv', index=False)


    def preprocess_proteins(self):
        """
        Processes the protein data by renaming columns and saving the processed data.
        This method simplifies access to protein data for downstream analysis.
        """
        df = pd.read_csv(f'{self.data_dir}/Nodes/Protein_Properties.csv')
        df.rename(columns={"RefSeq Accession": "ProteinRefSeqAccession",
                           "Description": "ProteinDescription"}, inplace=True)
        df.to_csv(f'{self.data_dir}/Nodes/Protein_Properties_Processed.csv', index=False)


    def preprocess_genes(self):
        """
        Processes gene data by renaming columns and changing data types for specific fields.
        The processed data is saved for further use in gene-related analyses.
        """
        df = pd.read_csv(f'{self.data_dir}/Nodes/Gene_Properties.csv')
        df.rename(columns={"Symbol": "GeneSymbol", "Taxonomy ID": "TaxonomyID",
                           "Synonyms": "GeneSynonyms"}, inplace=True)
        df['GeneID'] = df['GeneID'].astype('Int64')
        df['TaxonomyID'] = df['TaxonomyID'].astype('Int64')
        df.to_csv(f'{self.data_dir}/Nodes/Gene_Properties_Processed.csv', index=False)


    def preprocess_compounds(self):
        """
        Concatenates multiple CSV files containing compound data into a single file,
        renames columns for uniformity, and saves the consolidated data. This method
        facilitates easier management and analysis of compound data.
        """
        path = f'{self.data_dir}/Nodes/Compound_Properties'
        all_csv_files = glob.glob(path + "/*.csv")
        first_file = True
        output_file = f'{self.data_dir}/Nodes/Compound_Properties.csv'

        with open(output_file, 'w', newline='', encoding='utf-8') as f_out:
            for file in all_csv_files:
                with open(file, 'r', newline='', encoding='utf-8') as f_in:
                    header = f_in.readline()
                    if first_file:
                        f_out.write(header)
                        first_file = False
                    for line in f_in:
                        f_out.write(line)

        df = pd.read_csv(output_file)
        df.rename(columns={"CID": "CompoundID", "Title": "CompoundName"}, inplace=True)
        df.to_csv(f"{output_file.replace('.csv', '_Processed')}", index=False)

__init__(data_dir)

Initializes the NodeDataProcessor with a directory path to manage the data files.

Parameters:

    data_dir (str): The directory where the node data files are stored. Required.
Source code in chemgraphbuilder/node_data_processor.py
def __init__(self, data_dir: str):
    """
    Initializes the NodeDataProcessor with a directory path to manage the data files.

    Args:
        data_dir (str): The directory where the node data files are stored.
    """
    self.data_dir = data_dir

preprocess_assays()

Processes the assay data by renaming columns and saving the modified data back to disk. This method also handles visualization of assay data distributions if necessary.

Source code in chemgraphbuilder/node_data_processor.py
def preprocess_assays(self):
    """
    Processes the assay data by renaming columns and saving the modified data back to disk.
    This method also handles visualization of assay data distributions if necessary.
    """
    df = pd.read_csv(f'{self.data_dir}/Nodes/Assay_Properties.csv')
    df.rename(columns={"AID": "AssayID", "Assay Type": "AssayType",
                       "Activity Name": "AssayActivityName", "SourceID": "AssaySourceID",
                       "SourceName": "AssaySourceName", "Name": "AssayName",
                       "Description": "AssayDescription"}, inplace=True)
    df.to_csv(f'{self.data_dir}/Nodes/Assay_Properties_Processed.csv', index=False)

preprocess_compounds()

Concatenates multiple CSV files containing compound data into a single file, renames columns for uniformity, and saves the consolidated data. This method facilitates easier management and analysis of compound data.

Source code in chemgraphbuilder/node_data_processor.py
def preprocess_compounds(self):
    """
    Concatenates multiple CSV files containing compound data into a single file,
    renames columns for uniformity, and saves the consolidated data. This method
    facilitates easier management and analysis of compound data.
    """
    path = f'{self.data_dir}/Nodes/Compound_Properties'
    all_csv_files = glob.glob(path + "/*.csv")
    first_file = True
    output_file = f'{self.data_dir}/Nodes/Compound_Properties.csv'

    with open(output_file, 'w', newline='', encoding='utf-8') as f_out:
        for file in all_csv_files:
            with open(file, 'r', newline='', encoding='utf-8') as f_in:
                header = f_in.readline()
                if first_file:
                    f_out.write(header)
                    first_file = False
                for line in f_in:
                    f_out.write(line)

    df = pd.read_csv(output_file)
    df.rename(columns={"CID": "CompoundID", "Title": "CompoundName"}, inplace=True)
    df.to_csv(f"{output_file.replace('.csv', '_Processed')}", index=False)

preprocess_genes()

Processes gene data by renaming columns and changing data types for specific fields. The processed data is saved for further use in gene-related analyses.

Source code in chemgraphbuilder/node_data_processor.py
def preprocess_genes(self):
    """
    Processes gene data by renaming columns and changing data types for specific fields.
    The processed data is saved for further use in gene-related analyses.
    """
    df = pd.read_csv(f'{self.data_dir}/Nodes/Gene_Properties.csv')
    df.rename(columns={"Symbol": "GeneSymbol", "Taxonomy ID": "TaxonomyID",
                       "Synonyms": "GeneSynonyms"}, inplace=True)
    df['GeneID'] = df['GeneID'].astype('Int64')
    df['TaxonomyID'] = df['TaxonomyID'].astype('Int64')
    df.to_csv(f'{self.data_dir}/Nodes/Gene_Properties_Processed.csv', index=False)

preprocess_proteins()

Processes the protein data by renaming columns and saving the processed data. This method simplifies access to protein data for downstream analysis.

Source code in chemgraphbuilder/node_data_processor.py
77
def preprocess_proteins(self):
    """
    Processes the protein data by renaming columns and saving the processed data.
    This method simplifies access to protein data for downstream analysis.
    """
    df = pd.read_csv(f'{self.data_dir}/Nodes/Protein_Properties.csv')
    df.rename(columns={"RefSeq Accession": "ProteinRefSeqAccession",
                       "Description": "ProteinDescription"}, inplace=True)
    df.to_csv(f'{self.data_dir}/Nodes/Protein_Properties_Processed.csv', index=False)
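
After extraction, NodeDataProcessor is typically run once over the same data directory to normalize column names before the graph-loading step; a short sketch, assuming the default 'Data' layout used above:

from chemgraphbuilder.node_data_processor import NodeDataProcessor

processor = NodeDataProcessor(data_dir='Data')
processor.preprocess_assays()     # -> Data/Nodes/Assay_Properties_Processed.csv
processor.preprocess_proteins()   # -> Data/Nodes/Protein_Properties_Processed.csv
processor.preprocess_genes()      # -> Data/Nodes/Gene_Properties_Processed.csv
processor.preprocess_compounds()  # -> Data/Nodes/Compound_Properties_Processed.csv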

5. Add Graph Nodes

Module for adding node data from CSV files to a Neo4j database.

This module provides a class and methods to read node data from CSV files and add them to a Neo4j database, including creating uniqueness constraints and generating Cypher queries.

AddGraphNodes

Bases: Neo4jBase

A class used to add node data from a CSV file or a directory of CSV files to a Neo4j database.

Methods:

create_uniqueness_constraint(driver, label, unique_property): Create a uniqueness constraint for the unique property of nodes in Neo4j.
generate_cypher_queries(node_dict, label, unique_property): Generate Cypher queries to update nodes in Neo4j based on the data from the CSV file.
execute_queries(queries): Execute a list of provided Cypher queries against the Neo4j database.
read_csv_file(file_path, unique_property): Read data from a CSV file and extract node properties.
combine_csv_files(input_directory): Combine multiple CSV files with the same columns into a single DataFrame.
process_and_add_nodes(file_path, label, unique_property): Process the CSV file and add node data to the Neo4j database.
process_and_add_nodes_from_directory(directory_path, label, unique_property): Combine CSV files from a directory and add node data to the Neo4j database.

Source code in chemgraphbuilder/add_graph_nodes.py
class AddGraphNodes(Neo4jBase):
    """
    A class used to add node data from a CSV file or a directory of CSV files to a Neo4j database.

    Methods:
    --------
    create_uniqueness_constraint(driver, label, unique_property):
        Create a uniqueness constraint for the unique property of nodes in Neo4j.
    generate_cypher_queries(node_dict, label, unique_property):
        Generate Cypher queries to update nodes in Neo4j based on the data from the CSV file.
    execute_queries(queries):
        Execute a list of provided Cypher queries against the Neo4j database.
    read_csv_file(file_path, unique_property):
        Read data from a CSV file and extract node properties.
    combine_csv_files(input_directory):
        Combine multiple CSV files with the same columns into a single DataFrame.
    process_and_add_nodes(file_path, label, unique_property):
        Process the CSV file and add node data to the Neo4j database.
    process_and_add_nodes_from_directory(directory_path, label, unique_property):
        Combine CSV files from a directory and add node data to the Neo4j database.
    """

    def __init__(self, driver):
        """
        Initializes the AddGraphNodes class with a Neo4j driver.

        Parameters:
        -----------
        driver : neo4j.GraphDatabase.driver
            A driver instance to connect to the Neo4j database.
        """
        super().__init__()
        self.driver = driver
        self.logger.info("AddGraphNodes class initialized.")

    @staticmethod
    def create_uniqueness_constraint(driver, label, unique_property):
        """
        Create a uniqueness constraint for the unique property of nodes in Neo4j.

        Parameters:
        -----------
        driver : neo4j.GraphDatabase.driver
            A driver instance to connect to the Neo4j database.
        label : str
            The label of the node.
        unique_property : str
            The unique property of the node.
        """
        constraint_query = (
            f"CREATE CONSTRAINT IF NOT EXISTS FOR (n:{label}) "
            f"REQUIRE n.{unique_property} IS UNIQUE"
        )
        with driver.session() as session:
            try:
                session.run(constraint_query)
                logging.info(
                    "Uniqueness constraint created successfully on %s property of %s nodes.",
                    unique_property, label)
            except Exception as e:
                logging.error("Failed to create uniqueness constraint: %s", e)

    @staticmethod
    def _generate_property_string(value):
        if isinstance(value, (int, float)):
            return value
        try:
            return float(value)
        except (TypeError, ValueError):
            escaped_value = str(value).replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"').replace("\n", "\\n")
            return f"'{escaped_value}'"


    def generate_cypher_queries(self, node_dict, label, unique_property):
        """
        Generate Cypher queries for updating Neo4j based on the provided node data dictionary.

        Parameters:
        -----------
        node_dict : dict
            A dictionary with unique identifiers as keys and node data as values.
        label : str
            The label of the node.
        unique_property : str
            The unique property of the node.

        Yields:
        -------
        str
            A Cypher query string.
        """
        # Create an index for the unique_property
        create_index_query = f"CREATE INDEX IF NOT EXISTS FOR (n:{label}) ON (n.{unique_property})"
        self.logger.debug(create_index_query)
        yield create_index_query

        for unique_id, properties in node_dict.items():
            unique_id = f'"{unique_id}"' if isinstance(unique_id, str) else unique_id
            query = f"MERGE (n:{label} {{{unique_property}: {unique_id}}})"
            set_clauses = [
                f"n.{prop.replace(' ', '')} = {self._generate_property_string(value)}"
                for prop, value in properties.items()
            ]
            if set_clauses:
                query += " SET " + ", ".join(set_clauses)
            else:
                query += ";"
            self.logger.debug(query)
            yield query
        self.logger.info("Cypher queries generated successfully.")

    def execute_queries(self, queries):
        """
        Execute the provided list of Cypher queries against the Neo4j database.

        Parameters:
        -----------
        queries : list
            A list of Cypher query strings to execute.
        """
        self.logger.info("Executing Cypher queries...")
        with self.driver.session() as session:
            self.logger.info("Executing Cypher queries Started....")
            for query in queries:
                try:
                    session.run(query)
                except Exception as e:
                    self.logger.error("Failed to execute query: %s", e)
        self.logger.info("All queries executed.")

    def read_csv_file(self, file_path, unique_property):
        """
        Read data from a CSV file and extract node properties.

        Parameters:
        -----------
        file_path : str
            The path to the CSV file.
        unique_property : str
            The column name that serves as the unique identifier for the nodes.

        Returns:
        --------
        dict
            A dictionary with unique identifiers as keys and extracted data as values.
        """
        self.logger.info("Reading data from CSV file: %s", file_path)
        df = pd.read_csv(file_path).dropna(subset=[unique_property], how='any')
        node_dict = {
            row[unique_property]: row.drop(labels=[unique_property]).to_dict()
            for _, row in df.iterrows()
        }
        self.logger.info("Successfully read data for %d nodes from CSV.", len(node_dict))
        return node_dict

    def combine_csv_files(self, input_directory):
        """
        Combine multiple CSV files with the same columns into a single DataFrame.

        Parameters:
        -----------
        input_directory : str
            The directory containing the CSV files to be combined.

        Returns:
        --------
        DataFrame
            A combined DataFrame containing data from all the CSV files.
        """
        self.logger.info("Combining CSV files from directory: %s", input_directory)
        dfs = [
            pd.read_csv(os.path.join(input_directory, file))
            for file in os.listdir(input_directory)
            if file.endswith(".csv")
        ]
        combined_df = pd.concat(dfs, ignore_index=True)
        self.logger.info("Successfully combined %d CSV files.", len(dfs))
        return combined_df

    def process_and_add_nodes(self, file_path, label, unique_property):
        """
        Process the CSV file and add node data to the Neo4j database.

        Parameters:
        -----------
        file_path : str
            The path to the CSV file.
        label : str
            The label of the node.
        unique_property : str
            The unique property of the node.
        """
        self.logger.info("Processing and adding nodes from file: %s", file_path)
        node_dict = self.read_csv_file(file_path, unique_property)
        queries = list(self.generate_cypher_queries(node_dict, label, unique_property))
        self.execute_queries(queries)
        self.logger.info("Successfully processed and added nodes from file: %s", file_path)

    def process_and_add_nodes_from_directory(self, directory_path, label, unique_property):
        """
        Combine CSV files from a directory and add node data to the Neo4j database.

        Parameters:
        -----------
        directory_path : str
            The path to the directory containing the CSV files.
        label : str
            The label of the node.
        unique_property : str
            The unique property of the node.
        """
        self.logger.info("Processing and adding nodes from directory: %s", directory_path)
        combined_df = self.combine_csv_files(directory_path)
        temp_file = os.path.join(directory_path, "combined_temp.csv")
        combined_df.to_csv(temp_file, index=False)
        self.process_and_add_nodes(temp_file, label, unique_property)
        os.remove(temp_file)
        self.logger.info("Successfully processed and added nodes from directory: %s",
                         directory_path)

    def public_generate_property_string(self, value):
        """
        Public method to access the protected _generate_property_string method for testing.

        Parameters:
        -----------
        value : Any
            The value to be formatted.

        Returns:
        --------
        str
            The formatted property string.
        """
        return self._generate_property_string(value)

__init__(driver)

Initializes the AddGraphNodes class with a Neo4j driver.

Parameters:

driver : neo4j.GraphDatabase.driver
    A driver instance to connect to the Neo4j database.

Source code in chemgraphbuilder/add_graph_nodes.py
def __init__(self, driver):
    """
    Initializes the AddGraphNodes class with a Neo4j driver.

    Parameters:
    -----------
    driver : neo4j.GraphDatabase.driver
        A driver instance to connect to the Neo4j database.
    """
    super().__init__()
    self.driver = driver
    self.logger.info("AddGraphNodes class initialized.")

combine_csv_files(input_directory)

Combine multiple CSV files with the same columns into a single DataFrame.

Parameters:

input_directory : str
    The directory containing the CSV files to be combined.

Returns:

DataFrame
    A combined DataFrame containing data from all the CSV files.

Source code in chemgraphbuilder/add_graph_nodes.py
def combine_csv_files(self, input_directory):
    """
    Combine multiple CSV files with the same columns into a single DataFrame.

    Parameters:
    -----------
    input_directory : str
        The directory containing the CSV files to be combined.

    Returns:
    --------
    DataFrame
        A combined DataFrame containing data from all the CSV files.
    """
    self.logger.info("Combining CSV files from directory: %s", input_directory)
    dfs = [
        pd.read_csv(os.path.join(input_directory, file))
        for file in os.listdir(input_directory)
        if file.endswith(".csv")
    ]
    combined_df = pd.concat(dfs, ignore_index=True)
    self.logger.info("Successfully combined %d CSV files.", len(dfs))
    return combined_df

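A quick illustration, assuming a directory of same-schema CSVs (the directory name is hypothetical) and the adder instance from the sketch above:

combined = adder.combine_csv_files("Data/Nodes/Compound_Properties")
print(combined.shape)  # total rows and columns across all CSVs in the directory
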
create_uniqueness_constraint(driver, label, unique_property) staticmethod

Create a uniqueness constraint for the unique property of nodes in Neo4j.

Parameters:

driver : neo4j.GraphDatabase.driver
    A driver instance to connect to the Neo4j database.
label : str
    The label of the node.
unique_property : str
    The unique property of the node.

Source code in chemgraphbuilder/add_graph_nodes.py
@staticmethod
def create_uniqueness_constraint(driver, label, unique_property):
    """
    Create a uniqueness constraint for the unique property of nodes in Neo4j.

    Parameters:
    -----------
    driver : neo4j.GraphDatabase.driver
        A driver instance to connect to the Neo4j database.
    label : str
        The label of the node.
    unique_property : str
        The unique property of the node.
    """
    constraint_query = (
        f"CREATE CONSTRAINT IF NOT EXISTS FOR (n:{label}) "
        f"REQUIRE n.{unique_property} IS UNIQUE"
    )
    with driver.session() as session:
        try:
            session.run(constraint_query)
            logging.info(
                "Uniqueness constraint created successfully on %s property of %s nodes.",
                unique_property, label)
        except Exception as e:
            logging.error("Failed to create uniqueness constraint: %s", e)

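For example, with label "Compound" and unique property "CompoundID", the method issues exactly the query built by the f-string above:

CREATE CONSTRAINT IF NOT EXISTS FOR (n:Compound) REQUIRE n.CompoundID IS UNIQUE
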
execute_queries(queries)

Execute the provided list of Cypher queries against the Neo4j database.

Parameters:

queries : list
    A list of Cypher query strings to execute.

Source code in chemgraphbuilder/add_graph_nodes.py
def execute_queries(self, queries):
    """
    Execute the provided list of Cypher queries against the Neo4j database.

    Parameters:
    -----------
    queries : list
        A list of Cypher query strings to execute.
    """
    self.logger.info("Executing Cypher queries...")
    with self.driver.session() as session:
        self.logger.info("Executing Cypher queries Started....")
        for query in queries:
            try:
                session.run(query)
            except Exception as e:
                self.logger.error("Failed to execute query: %s", e)
    self.logger.info("All queries executed.")

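A minimal sketch, reusing the adder instance from the first example and a node_dict as produced by read_csv_file:

queries = list(adder.generate_cypher_queries(node_dict, "Compound", "CompoundID"))
adder.execute_queries(queries)

As the try/except above shows, a failing query is logged and skipped rather than aborting the whole batch.
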
generate_cypher_queries(node_dict, label, unique_property)

Generate Cypher queries for updating Neo4j based on the provided node data dictionary.

Parameters:

node_dict : dict
    A dictionary with unique identifiers as keys and node data as values.
label : str
    The label of the node.
unique_property : str
    The unique property of the node.

Yields:

str
    A Cypher query string.

Source code in chemgraphbuilder/add_graph_nodes.py
def generate_cypher_queries(self, node_dict, label, unique_property):
    """
    Generate Cypher queries for updating Neo4j based on the provided node data dictionary.

    Parameters:
    -----------
    node_dict : dict
        A dictionary with unique identifiers as keys and node data as values.
    label : str
        The label of the node.
    unique_property : str
        The unique property of the node.

    Yields:
    -------
    str
        A Cypher query string.
    """
    # Create an index for the unique_property
    create_index_query = f"CREATE INDEX IF NOT EXISTS FOR (n:{label}) ON (n.{unique_property})"
    self.logger.debug(create_index_query)
    yield create_index_query

    for unique_id, properties in node_dict.items():
        unique_id = f'"{unique_id}"' if isinstance(unique_id, str) else unique_id
        query = f"MERGE (n:{label} {{{unique_property}: {unique_id}}})"
        set_clauses = [
            f"n.{prop.replace(' ', '')} = {self._generate_property_string(value)}"
            for prop, value in properties.items()
        ]
        if set_clauses:
            query += " SET " + ", ".join(set_clauses)
        else:
            query += ";"
        self.logger.debug(query)
        yield query
    self.logger.info("Cypher queries generated successfully.")

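To make the generator's output concrete, here is a small hypothetical node_dict and the queries yielded for it (the index query first, then one MERGE per node):

node_dict = {123: {"MolecularWeight": 180.16, "Name": "aspirin"}}
for query in adder.generate_cypher_queries(node_dict, "Compound", "CompoundID"):
    print(query)
# CREATE INDEX IF NOT EXISTS FOR (n:Compound) ON (n.CompoundID)
# MERGE (n:Compound {CompoundID: 123}) SET n.MolecularWeight = 180.16, n.Name = 'aspirin'
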
process_and_add_nodes(file_path, label, unique_property)

Process the CSV file and add node data to the Neo4j database.

Parameters:

file_path : str
    The path to the CSV file.
label : str
    The label of the node.
unique_property : str
    The unique property of the node.

Source code in chemgraphbuilder/add_graph_nodes.py
def process_and_add_nodes(self, file_path, label, unique_property):
    """
    Process the CSV file and add node data to the Neo4j database.

    Parameters:
    -----------
    file_path : str
        The path to the CSV file.
    label : str
        The label of the node.
    unique_property : str
        The unique property of the node.
    """
    self.logger.info("Processing and adding nodes from file: %s", file_path)
    node_dict = self.read_csv_file(file_path, unique_property)
    queries = list(self.generate_cypher_queries(node_dict, label, unique_property))
    self.execute_queries(queries)
    self.logger.info("Successfully processed and added nodes from file: %s", file_path)

process_and_add_nodes_from_directory(directory_path, label, unique_property)

Combine CSV files from a directory and add node data to the Neo4j database.

Parameters:

directory_path : str
    The path to the directory containing the CSV files.
label : str
    The label of the node.
unique_property : str
    The unique property of the node.

Source code in chemgraphbuilder/add_graph_nodes.py
def process_and_add_nodes_from_directory(self, directory_path, label, unique_property):
    """
    Combine CSV files from a directory and add node data to the Neo4j database.

    Parameters:
    -----------
    directory_path : str
        The path to the directory containing the CSV files.
    label : str
        The label of the node.
    unique_property : str
        The unique property of the node.
    """
    self.logger.info("Processing and adding nodes from directory: %s", directory_path)
    combined_df = self.combine_csv_files(directory_path)
    temp_file = os.path.join(directory_path, "combined_temp.csv")
    combined_df.to_csv(temp_file, index=False)
    self.process_and_add_nodes(temp_file, label, unique_property)
    os.remove(temp_file)
    self.logger.info("Successfully processed and added nodes from directory: %s",
                     directory_path)

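For example (the directory name is illustrative), this single call combines every CSV under the folder, writes the temporary combined_temp.csv, loads the nodes, and removes the temporary file again:

adder.process_and_add_nodes_from_directory("Data/Nodes/Compound_Properties",
                                           label="Compound", unique_property="CompoundID")
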
public_generate_property_string(value)

Public method to access the protected _generate_property_string method for testing.

Parameters:

value : Any
    The value to be formatted.

Returns:

str
    The formatted property string.

Source code in chemgraphbuilder/add_graph_nodes.py
def public_generate_property_string(self, value):
    """
    Public method to access the protected _generate_property_string method for testing.

    Parameters:
    -----------
    value : Any
        The value to be formatted.

    Returns:
    --------
    str
        The formatted property string.
    """
    return self._generate_property_string(value)

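The formatting rules are easiest to see by example (via the public wrapper, using the adder instance from above):

adder.public_generate_property_string(42)       # -> 42 (ints and floats pass through as numbers)
adder.public_generate_property_string("3.14")   # -> 3.14 (numeric strings are converted to floats)
adder.public_generate_property_string("it's")   # -> 'it\'s' (other strings are escaped and single-quoted)
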
read_csv_file(file_path, unique_property)

Read data from a CSV file and extract node properties.

Parameters:

file_path : str
    The path to the CSV file.
unique_property : str
    The column name that serves as the unique identifier for the nodes.

Returns:

dict
    A dictionary with unique identifiers as keys and extracted data as values.

Source code in chemgraphbuilder/add_graph_nodes.py
def read_csv_file(self, file_path, unique_property):
    """
    Read data from a CSV file and extract node properties.

    Parameters:
    -----------
    file_path : str
        The path to the CSV file.
    unique_property : str
        The column name that serves as the unique identifier for the nodes.

    Returns:
    --------
    dict
        A dictionary with unique identifiers as keys and extracted data as values.
    """
    self.logger.info("Reading data from CSV file: %s", file_path)
    df = pd.read_csv(file_path).dropna(subset=[unique_property], how='any')
    node_dict = {
        row[unique_property]: row.drop(labels=[unique_property]).to_dict()
        for _, row in df.iterrows()
    }
    self.logger.info("Successfully read data for %d nodes from CSV.", len(node_dict))
    return node_dict

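For instance, given a hypothetical compounds.csv containing

CompoundID,MolecularWeight,Name
123,180.16,aspirin

the call below maps each unique identifier to its remaining columns:

node_dict = adder.read_csv_file("compounds.csv", "CompoundID")
# {123: {'MolecularWeight': 180.16, 'Name': 'aspirin'}}
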
6. Relationship Properties Extractor

This module defines the RelationshipPropertiesExtractor class, which is responsible for extracting and analyzing relationship properties among compounds, genes, and assays from the PubChem database.

The class facilitates the retrieval of complex relational data between chemical entities, enabling detailed analysis of biochemical interactions and properties. The extracted data is ideal for constructing knowledge graphs, supporting drug discovery, and understanding genetic influences on compound behavior.

Classes:

Name Description
- RelationshipPropertiesExtractor

A class to extract and analyze relationship properties from PubChem.

Usage Example

>>> extractor = RelationshipPropertiesExtractor()
>>> extractor.assay_compound_relationship("Data/AllDataCollected.csv")

This example fetches assay-compound relationship data for specified assays and saves the data to CSV files.

Note

Ensure network access to the PubChem API for data retrieval.

RelationshipPropertiesExtractor

Extracts and analyzes relationship properties among compounds, genes, and assays from the PubChem database.

This class facilitates the retrieval of complex relational data between chemical entities, enabling detailed analysis of biochemical interactions and properties. The extracted data is ideal for constructing knowledge graphs, supporting drug discovery, and understanding genetic influences on compound behavior.

Methods within the class are tailored to query specific relationship types from PubChem, including compound-assay relationships, compound co-occurrences, and compound transformations influenced by genes. Data fetched from PubChem is processed and saved in structured formats (CSV files), ready for further analysis or database integration.

Attributes:

Name Type Description
session Session

Session object to persist certain parameters across requests.

Usage

>>> extractor = RelationshipPropertiesExtractor()
>>> extractor.assay_compound_relationship("Data/AllDataCollected.csv")

This example fetches assay-compound relationship data for specified assays and saves the data to CSV files.

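The individual helpers can also be called directly; a minimal sketch (CID 2244 is aspirin, and network access to PubChem is required):

extractor = RelationshipPropertiesExtractor()
cid, similar = extractor.fetch_similar_cids(2244)
print(cid, similar[:5])  # the query CID and the first few 2D-similar CIDs (95% Tanimoto threshold)
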
Source code in chemgraphbuilder/relationship_properties_extractor.py
class RelationshipPropertiesExtractor:
    """
    Extracts and analyzes relationship properties among compounds, genes, and
    assays from the PubChem database.

    This class facilitates the retrieval of complex relational data between
    chemical entities, enabling detailed analysis of biochemical interactions
    and properties. The extracted data is ideal for constructing knowledge
    graphs, supporting drug discovery, and understanding genetic influences
    on compound behavior.

    Methods within the class are tailored to query specific relationship types
    from PubChem, including compound-assay relationships, compound co-occurrences,
    and compound transformations influenced by genes. Data fetched from PubChem
    is processed and saved in structured formats (CSV files), ready for further
    analysis or database integration.

    Attributes:
        session (requests.Session): Session object to persist certain parameters
        across requests.

    Usage:
        >>> extractor = RelationshipPropertiesExtractor()
        >>> extractor.assay_compound_relationship("Data/AllDataCollected.csv")
        This example fetches assay-compound relationship data for specified
        assays and saves the data to CSV files.
    """

    def __init__(self):
        """Initializes a RelationshipPropertiesExtractor with a Requests session
         for efficient network calls."""
        self.session = requests.Session()


    def _send_request(self, url, max_retries=5, initial_wait=1):
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return response
            except requests.HTTPError as e:
                if response.status_code == 503:
                    wait = initial_wait * (2 ** attempt)
                    print(f"Server busy or under maintenance. Retrying in {wait} seconds...")
                    time.sleep(wait)
                else:
                    print(f"HTTP Error: {e}")
                    break  # Break the loop for non-503 HTTP errors
            except requests.RequestException as e:
                print(f"Request Exception: {e}")
                wait = initial_wait * (2 ** attempt)
                print(f"Network error. Retrying in {wait} seconds...")
                time.sleep(wait)
        return None  # Return None to indicate failure after all retries


    def fetch_data_for_aid(self, aid, columns_to_remove):
        """
        Fetches and processes assay data for a specified Assay ID (AID) from the
        PubChem database, preparing it for analysis or further processing.

        This method queries the PubChem database for assay data associated with
        a given AID. It constructs the query URL, sends the request using a
        previously established session, and processes the response. The response
        is expected to be in CSV format, which this method reads into a pandas
        DataFrame. Specific columns can be removed from this DataFrame based on
        the requirements for analysis. This allows for the customization of
        the fetched data, making it easier to work with specific datasets.

        If the request is successful and the data is fetched without issues,
        it undergoes initial processing to remove unwanted columns as specified
        by the 'columns_to_remove' parameter. In case of an error during the
        data fetching or processing (e.g., issues with parsing the CSV data),
        appropriate error messages are logged, and an empty DataFrame is
        returned as a fallback.

        Parameters:
            aid (int): The assay ID for which data is to be fetched. This ID is
            used to construct the query URL to the PubChem database.
            columns_to_remove (list of str): A list of column names that should
            be removed from the fetched DataFrame. This allows for the exclusion
            of data that might not be relevant to the subsequent analysis or
            processing steps.

        Returns:
            pandas.DataFrame: A DataFrame containing the processed data
            associated with the given AID. The DataFrame will exclude columns
            listed in 'columns_to_remove'. If the data fetching fails or if
            an error occurs during processing, an empty DataFrame is returned.

        Raises:
            requests.RequestException: If an error occurs during the HTTP request
            to the PubChem API. This includes scenarios such as timeout issues,
            non-200 status codes, or network-related errors. The exception is
            handled internally with logging, but it's important to be aware of
            its possibility.
            pd.errors.ParserError: If an error occurs while parsing the CSV
            response from PubChem into a DataFrame. This could happen due to
            malformed data or unexpected changes in the response format.
            Like with RequestException, this error is logged and results in
            the return of an empty DataFrame.

        Example:
            >>> extractor = RelationshipPropertiesExtractor()
            >>> processed_data_df = extractor.fetch_data_for_aid(12345, ['UnwantedColumn1', 'UnwantedColumn2'])
            >>> print(processed_data_df.head())
            This example demonstrates how to fetch and process assay data for
            the assay with ID 12345, removing 'UnwantedColumn1' and
            'UnwantedColumn2' from the resulting DataFrame. The first few rows
            of the processed DataFrame are printed as an output.

        Note:
            - This method is part of a class that requires a valid session with
            the PubChem API. Ensure that the class is properly initialized and that
            the session is active.
            - The removal of columns is an optional step and can be customized
            based on the analysis needs. If no columns need to be removed, pass an
            empty list as 'columns_to_remove'.
        """
        url = (
            "https://pubchem.ncbi.nlm.nih.gov/assay/pcget.cgi?"
            "query=download&record_type=datatable&actvty="
            f"all&response_type=display&aid={aid}"
        )

        response = self._send_request(url)
        if response and response.status_code == 200:
            try:
                compound_df = pd.read_csv(StringIO(response.text), sep=',')
                # Drop specified columns and process column names in-place for memory efficiency
                columns_to_remove_set = set(columns_to_remove)
                existing_columns_set = set(compound_df.columns)
                columns_to_actually_remove = list(columns_to_remove_set & existing_columns_set)
                compound_df.drop(columns=columns_to_actually_remove,
                                 errors='ignore', inplace=True)
                compound_df.rename(columns=lambda x: x.replace('PUBCHEM_', '') if x.startswith('PUBCHEM_') else x, inplace=True)

                compound_df['AID'] = aid
                return compound_df
            except pd.errors.ParserError as e:
                logging.error(f"CSV parsing failed for AID {aid}: {e}")
        else:
            logging.error(f"Failed to fetch data for AID {aid}. Status code: {response.status_code if response else 'No Response'}")
        return pd.DataFrame()  # Return an empty DataFrame in case of failure



    def _process_dataframe(self, df, aid, columns_to_remove):
        """
        Processes the DataFrame by removing specified columns and renaming others.

        Parameters:
            df (pandas.DataFrame): The DataFrame to be processed.
            aid (int): The assay ID associated with the DataFrame.
            columns_to_remove (list of str): Columns to be removed from the DataFrame.

        Returns:
            pandas.DataFrame: The processed DataFrame with the specified columns
            removed, 'PUBCHEM_' prefixes stripped, and an 'AID' column appended.
        """
        # Drop unnecessary columns efficiently
        columns_to_remove_set = set(columns_to_remove)
        df = df.drop(columns=list(columns_to_remove_set.intersection(df.columns)), errors='ignore')

        # Efficiently rename columns that start with 'PUBCHEM_'
        df.columns = [col.replace('PUBCHEM_', '') if col.startswith('PUBCHEM_') else col for col in df.columns]
        df['AID'] = aid
        return df


    def assay_compound_relationship(self, assays_data, start_chunk=0):
        """
        Processes and stores relationships between assays and compounds based
        on assay data from PubChem.

        Parameters:
            assays_data (str): Path to a CSV file containing assay IDs (AIDs).
            start_chunk (int): The starting index for processing chunks.
        """
        for chunk_idx, chunk in enumerate(pd.read_csv(assays_data, chunksize=100)):
            if chunk_idx >= start_chunk:
                columns_to_remove = ['PUBCHEM_RESULT_TAG', 'PUBCHEM_SID', 'PUBCHEM_EXT_DATASOURCE_SMILES']
                output_dir = 'Data/Relationships/Assay_Compound_Relationship'

                for aid in chunk['AID']:
                    if not os.path.exists(f'{output_dir}/AID_{aid}.csv'):
                        df = self.fetch_data_for_aid(aid, columns_to_remove)
                        if not df.empty:
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            df.to_csv(f'{output_dir}/AID_{aid}.csv', index=False)
                logging.info(f"Processed chunk {chunk_idx} for assay-compound relationships.")
            else:
                logging.info(f"Skipping chunk {chunk_idx} (before start_chunk).")



    def _write_to_csv(self, df, filename):
        """
        Writes a DataFrame to a CSV file.
        """
        df.to_csv(filename, index=False)


    def assay_gene_relationship(self, main_data):
        """
        Extracts and saves relationships between assays and proteins from the
        specified dataset.

        This method processes assay data to identify relationships between
        assays and their target proteins. It selects relevant columns from the
        input data, removes duplicates to ensure unique relationships, and saves
        the cleaned data to a CSV file for further analysis or integration into
        knowledge graphs.

        Parameters:
            main_data (str): Path to the CSV file containing the main data. The
            file should include columns for 'AID' (Assay ID), 'Target GeneID',
            and 'Activity Name'.

        Returns:
            pandas.DataFrame: A DataFrame containing the unique relationships
            between assays and proteins, including the assay ID, target gene ID,
            and activity name.

        Side Effects:
            - Writes a CSV file to 'Data/Relationships/Assay_Gene_Relationship.csv',
            containing the processed relationships data.
        """
        df = pd.read_csv(main_data)
        columns_to_select = ['AID', 'Target GeneID', 'Activity Name']
        df = df[columns_to_select]
        df = df.drop_duplicates(keep='first', ignore_index=True)
        df.to_csv(f'Data/Relationships/Assay_Gene_Relationship.csv', index=False)
        return df


    def gene_protein_relationship(self, main_data):
        """
        Extracts and saves relationships between genes and proteins based on
        the provided dataset.

        This method selects relevant columns to highlight the relationships
        between genes and their corresponding proteins.
        It removes duplicate entries to ensure that each relationship is
        represented uniquely and saves the resultant data to
        a CSV file. This facilitates easy integration of genetic data into
        knowledge bases or further analysis.

        Parameters:
            main_data (str): Path to the CSV file containing gene and protein data.
            Expected columns include 'Target GeneID' and 'Target Accession'.

        Returns:
            pandas.DataFrame: A DataFrame of unique gene-protein relationships,
            including gene ID and protein accession numbers.

        Side Effects:
            - Writes the processed data to 'Data/Gene_Protein_Relationship.csv'
            in a structured CSV format.
        """
        df = pd.read_csv(main_data)
        columns_to_select = ['Target GeneID', 'Target Accession']
        df = df[columns_to_select]
        df = df.drop_duplicates(keep='first', ignore_index=True)
        df.to_csv(f'Data/Relationships/Gene_Protein_Relationship.csv', index=False)
        return df


    def compound_gene_relationship(self, main_data):
        """
        Identifies and records relationships between compounds and proteins from
        the input data.

        This method focuses on extracting compound-protein interaction data,
        including activity outcomes and values. It selects
        pertinent columns, removes duplicate records, and sorts the data by
        Compound ID and Target Accession for clarity. The cleaned dataset is
        then saved to a CSV file, providing a structured view  of how compounds
        interact with various proteins, which can be critical for drug discovery
        and pharmacological research.

        Parameters:
            main_data (str): Path to the CSV file with compound and protein data.
            This file should contain columns for 'CID' (Compound ID),
            'Target Accession', 'Activity Outcome', 'Activity Name', and
            'Activity Value [uM]'.

        Returns:
            pandas.DataFrame: A DataFrame with processed compound-protein
            relationships, sorted and cleaned for direct analysis or database
            insertion.

        Side Effects:
            - Saves the processed relationships data to
            'Data/Relationships/Compound_Gene_Relationship.csv',
            facilitating easy access and integration.
        """
        df = pd.read_csv(main_data)
        columns_to_select = ['CID', 'Target GeneID', 'Target Accession',
                             'Activity Outcome', 'Activity Name',
                             'Activity Value [uM]']
        df = df[columns_to_select]
        df = df.drop_duplicates(keep='first', ignore_index=True)
        df = df.sort_values(['CID', 'Target Accession'])
        df.dropna(axis=0, thresh=1, inplace=True)  # Drop rows with no non-null values
        df.to_csv('Data/Relationships/Compound_Gene_Relationship.csv', index=False)
        return df


    def fetch_similar_cids(self, cid):
        """
        Fetches similar compound IDs (CIDs) from the PubChem database for a
        given compound ID (CID) using 2D similarity.

        This method queries the PubChem database to find compounds that are
        similar to the given CID based on 2D structural similarity.
        The similarity threshold is set to 95%, and a maximum of 100 similar
        CIDs are fetched. The response is parsed from XML format to extract
        the similar CIDs.

        Parameters:
            cid (int): The compound ID for which similar CIDs are to be fetched.

        Returns:
            tuple: A tuple containing the original CID and a list of similar
            CIDs. If an error occurs, the list of similar CIDs will be empty.

        Raises:
            Exception: Logs an error message with the original CID and the
            exception if the request to PubChem fails or if parsing the XML
            response encounters an error.

        Note:
            - The method utilizes the `requests` library for HTTP requests and
            `xml.etree.ElementTree` for XML parsing.
            - In case of a request failure or parsing error, the method logs
            the error and returns the original CID with an empty list,
            allowing the calling function to handle the exception as needed.
        """
        url = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/"
               f"fastsimilarity_2d/cid/{int(cid)}/cids/XML?Threshold=95&MaxRecords=100")
        try:
            response = requests.get(url)
            response.raise_for_status()
            xml_data = response.text

            # Parse XML data
            tree = ET.parse(io.StringIO(xml_data))
            root = tree.getroot()

            # Extracting CID values
            similar_cids = [element.text for element in root.findall('{http://pubchem.ncbi.nlm.nih.gov/pug_rest}CID')]
            return cid, similar_cids
        except Exception as e:
            logging.error(f"Error processing CID {cid}: {e}")
            return cid, []


    def process_chunk(self, chunk):
        """
        Processes a chunk of CIDs in parallel to fetch similar CIDs for each CID
        in the chunk.

        This method uses a ThreadPoolExecutor to send out concurrent requests for
        fetching similar CIDs for a list of CIDs.
        The number of worker threads is set to 5. Each CID's request is handled
        by `fetch_similar_cids` method.

        Parameters:
            chunk (list of int): A list of compound IDs (CIDs) to process in
            parallel.

        Returns:
            list of tuples: A list of tuples, each containing a CID and its
            corresponding list of similar CIDs.

        Side Effects:
            - Utilizes concurrent threads to speed up the fetching process.
            - May log errors if any occur during the fetching of similar CIDs
            for individual CIDs.
        """
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(self.fetch_similar_cids, cid) for cid in chunk]
            results = [future.result() for future in as_completed(futures)]
        return results


    def compound_similarity_relationship(self, main_data, start_chunk=0):
        """
        Identifies and records the similarity relationships between compounds
        based on a list of CIDs. The similarity is determined by the Tanimoto
        similarity coefficient with a threshold of 95% to ensure high structural
        similarity.

        This method reads a CSV file containing compound data, filters compounds
        based on specific 'Target GeneID' values,
        and fetches similar CIDs for each compound. The compounds are processed
        in chunks to manage memory usage and improve efficiency. The results are
        saved into separate CSV files for each chunk.

        Parameters:
            main_data (str): Path to the CSV file containing the main compound data.
            start_chunk (int): The starting index for processing chunks.

        Note:
            - The method filters the main data for compounds associated with
            specific 'Target GeneID' values before fetching similar CIDs,
            optimizing the process for relevant compounds only.
            - The division of CIDs into chunks and concurrent processing helps
            in managing large datasets and utilizes parallelism for faster
            execution.
        """
        df = pd.read_csv(main_data)
        df = df[df['Target GeneID'].isin([1576, 1544, 1557, 1559, 1565])]
        df = df.dropna(subset=['CID'])
        IDs = df['CID'].unique().tolist()

        chunk_size = 10000
        chunks = [IDs[i:i + chunk_size] for i in range(0, len(IDs), chunk_size)]

        for i, chunk in enumerate(chunks, start=0):
            if i >= start_chunk:
                chunk_results = self.process_chunk(chunk)
                chunk_df = pd.DataFrame(chunk_results, columns=['CID', 'Similar CIDs'])
                if not os.path.exists('Data/Relationships/Compound_Similarities'):
                    os.makedirs('Data/Relationships/Compound_Similarities')
                chunk_df.to_csv(f'Data/Relationships/Compound_Similarities/Chunk_{i}.csv', index=False)
                logging.info(f"Processed chunk {i} for compound similarity relationships.")


    def _fetch_chemical_neighbor_data(self, cid):
        """
        Fetches chemical-chemical relationship data for a given CID.

        Args:
            cid (int): The compound ID for which data is to be fetched.

        Returns:
            list: List of chemical-chemical relationship data.
        """
        cpd_cpd_url = ("https://pubchem.ncbi.nlm.nih.gov/link_db/link_db_server.cgi?format=JSON&type="
                       f"ChemicalNeighbor&operation=GetAllLinks&id_1={int(cid)}&response_type=display")
        try:
            response = self._send_request(cpd_cpd_url)
            data = response.json()
            return data.get('LinkDataSet', {}).get('LinkData', [])
        except Exception as e:
            logging.error(f"Failed to fetch chemical-chemical data for CID {cid}: {e}")
            return []


    def _fetch_chemical_gene_data(self, gid):
        """
        Fetches chemical-gene relationship data for a given gene symbol.

        Args:
            gid (str): The gene symbol for which data is to be fetched.

        Returns:
            list: List of chemical-gene relationship data.
        """
        cpd_gene_url = ("https://pubchem.ncbi.nlm.nih.gov/link_db/link_db_server.cgi?format=JSON&"
                        f"type=GeneSymbolChemicalNeighbor&operation=GetAllLinks&id_1={gid}&response_type=display")
        try:
            response = self._send_request(cpd_gene_url)
            data = response.json()
            return data.get('LinkDataSet', {}).get('LinkData', [])
        except Exception as e:
            logging.error(f"Failed to fetch chemical-gene data for CID {gid}: {e}")
            return []


    def _fetch_chemical_gene_interaction_data(self, gid):
        """
        Fetches compound-gene interaction data for a given gene ID.

        Args:
            gid (int): The gene ID for which data is to be fetched.

        Returns:
            dict: Compound-gene interaction data parsed from the JSON response,
            or an empty list if the request fails.
        """
        base_url = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi"
        query = {
            "download": "*",
            "collection": "consolidatedcompoundtarget",
            "order": ["cid,asc"],
            "start": 1,
            "limit": 10000000,
            "downloadfilename": f"pubchem_geneid_{int(gid)}_consolidatedcompoundtarget",
            "where": {
                "ands": [
                    {"geneid": f"{int(gid)}"}
                ]
            }
        }

        # Convert the query dictionary to a JSON string
        query_json_str = json.dumps(query)

        # URL encode the JSON string
        encoded_query = quote(query_json_str)

        # Construct the full URL
        url = f"{base_url}?infmt=json&outfmt=json&query={encoded_query}"

        try:
            response = self._send_request(url)
            data = response.json()
            return data
        except Exception as e:
            logging.error(f"Failed to fetch chemical-gene data for CID {gid}: {e}")
            return []


    def _write_data_to_csv(self, data, filename, filter_condition=None):
        """
        Writes given data to a CSV file, with optional filtering before saving.

        This method takes a list of dictionaries (data), converts it into a
        pandas DataFrame, and optionally filters the DataFrame based on
        specified conditions before writing the result to a CSV file. The
        filtering is performed on specified columns with their expected
        values provided in 'filter_condition'. This allows for selective
        data saving, especially useful when dealing with large datasets
        or when only a subset of data is needed for further processing
        or analysis.

        Parameters:
            data (list of dict): Data to be written to a CSV file. Each
            dictionary in the list represents a row in the DataFrame, with keys
            as column names and values as row values.
            filename (str): Path to the CSV file where the data will be saved.
            If the file exists, it will be overwritten.
            filter_condition (dict, optional): A dictionary specifying the
            columns to filter by and the values to include. Keys in the
            dictionary are column names, and values are lists of acceptable
            values for that column. Rows not meeting the filter condition are
            excluded from the final DataFrame to be saved.

        Side Effects:
            - Writes a CSV file to the given filename path. The file is overwritten
            if it already exists.
            - Logs a warning if a specified column for filtering is not found in
            the DataFrame.
        """

        df = pd.DataFrame(data)
        if filter_condition:
            for column, values in filter_condition.items():
                if column in df.columns:
                    df = df[df[column].isin(values)]
                else:
                    logging.warning(f"Column {column} not found in DataFrame.")
        if not df.empty:
            df.to_csv(filename, index=False)


    def compound_compound_cooccurrence(self, main_data, rate_limit=5):
        """
        Analyzes compound-compound co-occurrence relationships from the specified main data file and saves the results into structured CSV files.

        Args:
            main_data (str): Path to the main data file.
            rate_limit (int): The maximum number of requests per second.

        Returns:
            str: A message indicating the completion of data fetching and saving.
        """
        logging.info("Starting compound-compound co-occurrence data retrieval ...")
        start_time = timeit.default_timer()

        try:
            df = pd.read_csv(main_data)
            logging.info(f"Loaded data from {main_data}. Total rows: {len(df)}")
        except FileNotFoundError:
            logging.error(f"File not found: {main_data}")
            return "File not found."
        except pd.errors.EmptyDataError:
            logging.error(f"Empty data file: {main_data}")
            return "Empty data file."
        except Exception as e:
            logging.error(f"Error reading {main_data}: {e}")
            return "Error reading data file."

        compound_ids = df['CID'].dropna().unique().tolist()
        logging.info(f"Unique Compound IDs to process: {len(compound_ids)}")

        for compound_id in compound_ids:
            logging.info(f"Processing Compound ID {int(compound_id)}")
            try:
                data = self._fetch_chemical_neighbor_data(int(compound_id))
                filename = f"Data/Relationships/Cpd_Cpd_CoOccurrence/CID_{int(compound_id)}.csv"
                self._write_data_to_csv(data, filename)
                logging.info(f"Successfully wrote data for Compound ID {int(compound_id)} to {filename}")
            except Exception as e:
                logging.error(f"Error processing Compound ID {int(compound_id)}: {e}")
            time.sleep(1 / rate_limit)  # Ensuring we don't exceed rate limit

        elapsed = timeit.default_timer() - start_time
        logging.info(f"Compound-compound data fetching and saving completed in {elapsed:.2f} seconds.")
        return "Compound-compound data fetching and saving completed."


    def compound_gene_cooccurrence(self, gene_data, rate_limit=5):
        """
        Analyzes compound-gene co-occurrence relationships from the specified main data file and saves the results into structured CSV files.
        """
        logging.info("Starting compound-gene co-occurrence analysis...")
        start_time = timeit.default_timer()

        try:
            df = pd.read_csv(gene_data)
            logging.info(f"Loaded data from {gene_data}. Total rows: {len(df)}")
        except FileNotFoundError:
            logging.error(f"File not found: {gene_data}")
            return "File not found."
        except pd.errors.EmptyDataError:
            logging.error(f"Empty data file: {gene_data}")
            return "Empty data file."
        except Exception as e:
            logging.error(f"Error reading {gene_data}: {e}")
            return "Error reading data file."

        gene_symbols = df['GeneSymbol'].unique().tolist()
        logging.info(f"Unique Gene Symbols to process: {len(gene_symbols)}")

        for gene_symbol in gene_symbols:
            logging.info(f"Processing Gene Symbol {gene_symbol}")
            try:
                data = self._fetch_chemical_gene_data(gene_symbol)
                filename = f"Data/Relationships/Cpd_Gene_CoOccurrence/Cpd_Gene_CoOccurrence_{gene_symbol}.csv"
                self._write_data_to_csv(data, filename)
                logging.info(f"Successfully wrote data for Gene Symbol {gene_symbol} to {filename}")
            except Exception as e:
                logging.error(f"Error processing Gene Symbol {gene_symbol}: {e}")
            time.sleep(1 / rate_limit)  # Ensuring we don't exceed rate limit

        elapsed = timeit.default_timer() - start_time
        logging.info(f"Compound-gene data fetching and saving completed in {elapsed:.2f} seconds.")
        return "Compound-gene data fetching and saving completed."


    def compound_gene_interaction(self, gene_data, rate_limit=5):
        """
        Analyzes compound-gene co-occurrence relationships from the specified main data file and saves the results into structured CSV files.
        """
        logging.info("Starting compound-gene co-occurrence analysis...")
        start_time = timeit.default_timer()

        try:
            df = pd.read_csv(gene_data)
            logging.info(f"Loaded data from {gene_data}. Total rows: {len(df)}")
        except FileNotFoundError:
            logging.error(f"File not found: {gene_data}")
            return "File not found."
        except pd.errors.EmptyDataError:
            logging.error(f"Empty data file: {gene_data}")
            return "Empty data file."
        except Exception as e:
            logging.error(f"Error reading {gene_data}: {e}")
            return "Error reading data file."

        gene_ids = df['GeneID'].dropna().unique().tolist()
        logging.info(f"Unique Gene IDs to process: {len(gene_ids)}")

        for gene_id in gene_ids:
            logging.info(f"Processing Gene ID {int(gene_id)}")
            try:
                data = self._fetch_chemical_gene_interaction_data(int(gene_id))
                filename = f"Data/Relationships/Compound_Gene_Relationship/Compound_Gene_Interaction_Outside_PubChem_{int(gene_id)}.csv"
                df = pd.DataFrame(data)
                if not df.empty:
                    # Reorder columns so 'cid' and 'geneid' come first
                    all_columns = [col for col in df.columns if col not in ('cid', 'geneid')]
                    all_columns = ['cid', 'geneid'] + all_columns
                    df = df[all_columns]
                    df.to_csv(filename, index=False)
                logging.info(f"Successfully wrote data for Gene ID {int(gene_id)} to {filename}")
            except Exception as e:
                logging.error(f"Error processing Gene ID {int(gene_id)}: {e}")
            time.sleep(1 / rate_limit)  # Ensuring we don't exceed rate limit

        elapsed = timeit.default_timer() - start_time
        logging.info(f"Compound-gene data fetching and saving completed in {elapsed:.2f} seconds.")


    def compound_transformation(self, gene_properties):
        """
        Analyzes compound transformation data based on gene properties, focusing
        on metabolic transformations involving specified genes. This method
        queries the PubChem database for transformation data related
        to compounds associated with the genes identified in the provided CSV file.

        Parameters:
            gene_properties (str): Path to the CSV file containing gene properties
            generated by the NodePropertiesExtractor class, which should include
            'GeneID' as one of its columns. This file is used to identify genes
            of interest for which compound transformation data will be fetched.

        Processing Steps:
            1. Reads the provided CSV file to extract unique gene identifiers.
            2. For each gene identifier, constructs a query to fetch relevant
            compound transformation data from PubChem, focusing on metabolic
            transformations where the gene plays a role.
            3. Processes and aggregates the fetched data into a structured
            pandas DataFrame.
            4. Filters the aggregated data to retain specific columns relevant
            to compound transformations, including substrate and metabolite
            Compound IDs (CIDs), the type of metabolic conversion, gene
            identifiers, PubMed IDs, and DOIs for related publications.
            5. Saves the aggregated and filtered DataFrame to a CSV file for
            further analysis or integration into knowledge graphs or other
            data models.

        Returns:
            pandas.DataFrame: A DataFrame containing processed compound
            transformation data, including substrate and metabolite CIDs,
            metabolic conversion types, gene identifiers, PubMed IDs, and DOIs.
            The DataFrame structure facilitates further analysis or use in
            constructing detailed views of metabolic pathways involving the
            specified genes.

        Side Effects:
            - Saves the aggregated compound transformation data to
            'Data/Relationships/Compound_Transformation.csv'
            in the current working directory. This file captures the relationship
            between substrates, metabolites, and genes based on the input gene
            properties.

        Raises:
            FileNotFoundError: If the specified 'gene_properties' file does not
            exist or cannot be read.
            ValueError: If 'gene_properties' does not contain the required
            'GeneID' column.

        Example:
            >>> extractor = RelationshipPropertiesExtractor()
            >>> transformation_df = extractor.compound_transformation('Data/Nodes/gene_properties.csv')
            >>> print(transformation_df.head())
            This example processes gene properties from
            'Data/Nodes/gene_properties.csv', queries PubChem for
            compound transformation data related to the genes,
            and compiles the results into a DataFrame.

        Note:
            The method assumes that the input 'gene_properties' file is
            accessible and correctly formatted.
            The availability and structure of the PubChem database may affect
            the completeness and accuracy of the fetched transformation data.
            Users should verify the existence of the 'Data/Relationships'
            directory and have appropriate permissions to write files to it.
        """
        df = pd.read_csv(gene_properties)
        IDs = df['Target GeneID'].unique().tolist()

        transformation_dfs = []

        for gid in IDs:
            if not np.isnan(gid):
                gid = int(gid)
                base_url = 'https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=csv&query='
                query = {
                    "download": "*",
                    "collection": "chemblmetabolism",
                    "order": ["relevancescore,desc"],
                    "start": 1,
                    "limit": 10000000,
                    "downloadfilename": f"pubchem_geneid_{gid}_chemblmetabolism",
                    "where": {
                        "ands": [{"geneid": gid}]
                    }
                }

                # Convert the dictionary to a JSON string
                query_string = json.dumps(query)
                # URL encode the JSON string
                encoded_query = quote(query_string)
                # Construct the final URL
                url = f"{base_url}{encoded_query}"

                response = self._send_request(url)
                if response:
                    try:
                        # Read the CSV data
                        transformation_df = pd.read_csv(StringIO(response.text), sep=',', header=0, low_memory=False)

                        # Keep only the columns relevant to compound transformations
                        transformation_df = transformation_df[['substratecid',
                                                               'metabolitecid',
                                                               'metconversion',
                                                               'geneids',
                                                               'pmids',
                                                               'dois']]

                        # Append the DataFrame to the list
                        transformation_dfs.append(transformation_df)
                    except pd.errors.ParserError as e:
                        logging.error(f"Error parsing CSV for gene ID {gid}: {e}\nurl:{url}")
                        continue  # Skip this gene ID and continue with others

        # Concatenate all DataFrames
        if transformation_dfs:
            transformation_df = pd.concat(transformation_dfs, ignore_index=True)
        else:
            transformation_df = pd.DataFrame(columns=['substratecid', 'metabolitecid', 'metconversion', 'geneids', 'pmids', 'dois'])

        self._write_to_csv(transformation_df, 'Data/Relationships/Compound_Transformation.csv')

        return transformation_df

__init__()

Initializes a RelationshipPropertiesExtractor with a Requests session for efficient network calls.

Source code in chemgraphbuilder/relationship_properties_extractor.py
def __init__(self):
    """Initializes a RelationshipPropertiesExtractor with a Requests session
     for efficient network calls."""
    self.session = requests.Session()
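
Reusing one `requests.Session` across calls enables HTTP connection pooling, which noticeably reduces overhead when a method issues hundreds of sequential PubChem requests. A minimal sketch of the pattern, independent of the class (the property endpoint shown is a standard PUG REST call; the CIDs are arbitrary examples):

    import requests

    session = requests.Session()  # one pooled session reused for every request

    # Consecutive requests to the same host reuse the underlying TCP connection.
    for cid in (2244, 3672):  # example CIDs
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/Title/TXT"
        response = session.get(url, timeout=30)
        print(cid, response.text.strip())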

assay_compound_relationship(assays_data, start_chunk=0)

Processes and stores relationships between assays and compounds based on assay data from PubChem.

Parameters:

    assays_data (str): Path to a CSV file containing assay IDs (AIDs). Required.
    start_chunk (int): The starting index for processing chunks. Default: 0.
Source code in chemgraphbuilder/relationship_properties_extractor.py
def assay_compound_relationship(self, assays_data, start_chunk=0):
    """
    Processes and stores relationships between assays and compounds based
    on assay data from PubChem.

    Parameters:
        assays_data (str): Path to a CSV file containing assay IDs (AIDs).
        start_chunk (int): The starting index for processing chunks.
    """
    for chunk_idx, chunk in enumerate(pd.read_csv(assays_data, chunksize=100)):
        if chunk_idx >= start_chunk:
            columns_to_remove = ['PUBCHEM_RESULT_TAG', 'PUBCHEM_SID', 'PUBCHEM_EXT_DATASOURCE_SMILES']
            output_dir = 'Data/Relationships/Assay_Compound_Relationship'

            for aid in chunk['AID']:
                if not os.path.exists(f'{output_dir}/AID_{aid}.csv'):
                    df = self.fetch_data_for_aid(aid, columns_to_remove)
                    if not df.empty:
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        df.to_csv(f'{output_dir}/AID_{aid}.csv', index=False)
            logging.info(f"Processed chunk {chunk_idx} for assay-compound relationships.")
        else:
            logging.info(f"Skipping chunk {chunk_idx} (before start_chunk {start_chunk}).")
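
A hedged usage sketch: the input path is hypothetical and only needs an 'AID' column; `start_chunk=2` resumes after the first two 100-row chunks of an interrupted run:

    extractor = RelationshipPropertiesExtractor()
    extractor.assay_compound_relationship('Data/Nodes/Assay_Properties.csv', start_chunk=2)

Because each assay's output is written to its own `AID_<aid>.csv` and existing files are skipped, re-running the call is effectively idempotent.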

assay_gene_relationship(main_data)

Extracts and saves relationships between assays and proteins from the specified dataset.

This method processes assay data to identify relationships between assays and their target proteins. It selects relevant columns from the input data, removes duplicates to ensure unique relationships, and saves the cleaned data to a CSV file for further analysis or integration into knowledge graphs.

Parameters:

    main_data (str): Path to the CSV file containing the main data. The file should include columns for 'AID' (Assay ID), 'Target GeneID', and 'Activity Name'. Required.

Returns:

    pandas.DataFrame: A DataFrame containing the unique relationships between assays and proteins, including the assay ID, target gene ID, and activity name.

Side Effects
  • Writes a CSV file to 'Data/Relationships/Assay_Gene_Relationship.csv', containing the processed relationships data.
Source code in chemgraphbuilder/relationship_properties_extractor.py
def assay_gene_relationship(self, main_data):
    """
    Extracts and saves relationships between assays and proteins from the
    specified dataset.

    This method processes assay data to identify relationships between
    assays and their target proteins. It selects relevant columns from the
    input data, removes duplicates to ensure unique relationships, and saves
    the cleaned data to a CSV file for further analysis or integration into
    knowledge graphs.

    Parameters:
        main_data (str): Path to the CSV file containing the main data. The
        file should include columns for 'AID' (Assay ID), 'Target GeneID',
        and 'Activity Name'.

    Returns:
        pandas.DataFrame: A DataFrame containing the unique relationships
        between assays and proteins, including the assay ID, target gene ID,
        and activity name.

    Side Effects:
        - Writes a CSV file to 'Data/Relationships/Assay_Gene_Relationship.csv',
        containing the processed relationships data.
    """
    df = pd.read_csv(main_data)
    columns_to_select = ['AID', 'Target GeneID', 'Activity Name']
    df = df[columns_to_select]
    df = df.drop_duplicates(keep='first', ignore_index=True)
    df.to_csv(f'Data/Relationships/Assay_Gene_Relationship.csv', index=False)
    return df
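
A short usage sketch, assuming an input CSV with the three required columns ('Data/AllDataConnected.csv' is the name used elsewhere in this package and stands in here for any suitable file):

    extractor = RelationshipPropertiesExtractor()
    relationships = extractor.assay_gene_relationship('Data/AllDataConnected.csv')
    print(relationships.head())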

compound_compound_cooccurrence(main_data, rate_limit=5)

Analyzes compound-compound co-occurrence relationships from the specified main data file and saves the results into structured CSV files.

Parameters:

    main_data (str): Path to the main data file. Required.
    rate_limit (int): The maximum number of requests per second. Default: 5.

Returns:

    str: A message indicating the completion of data fetching and saving.

Source code in chemgraphbuilder/relationship_properties_extractor.py
def compound_compound_cooccurrence(self, main_data, rate_limit=5):
    """
    Analyzes compound-compound co-occurrence relationships from the specified main data file and saves the results into structured CSV files.

    Args:
        main_data (str): Path to the main data file.
        rate_limit (int): The maximum number of requests per second.

    Returns:
        str: A message indicating the completion of data fetching and saving.
    """
    logging.info("Starting compound-compound co-occurrence data retrieval ...")
    start_time = timeit.default_timer()

    try:
        df = pd.read_csv(main_data)
        logging.info(f"Loaded data from {main_data}. Total rows: {len(df)}")
    except FileNotFoundError:
        logging.error(f"File not found: {main_data}")
        return "File not found."
    except pd.errors.EmptyDataError:
        logging.error(f"Empty data file: {main_data}")
        return "Empty data file."
    except Exception as e:
        logging.error(f"Error reading {main_data}: {e}")
        return "Error reading data file."

    compound_ids = df['CID'].dropna().unique().tolist()
    logging.info(f"Unique Compound IDs to process: {len(compound_ids)}")

    for compound_id in compound_ids:
        logging.info(f"Processing Compound ID {int(compound_id)}")
        try:
            data = self._fetch_chemical_neighbor_data(int(compound_id))
            filename = f"Data/Relationships/Cpd_Cpd_CoOccurrence/CID_{int(compound_id)}.csv"
            self._write_data_to_csv(data, filename)
            logging.info(f"Successfully wrote data for Compound ID {int(compound_id)} to {filename}")
        except Exception as e:
            logging.error(f"Error processing Compound ID {int(compound_id)}: {e}")
        time.sleep(1 / rate_limit)  # Ensuring we don't exceed rate limit

    elapsed = timeit.default_timer() - start_time
    logging.info(f"Compound-compound data fetching and saving completed in {elapsed:.2f} seconds.")
    return "Compound-compound data fetching and saving completed."
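
The `time.sleep(1 / rate_limit)` call is simple fixed-interval throttling: at most `rate_limit` calls are started per second. The same idiom in isolation, using a stand-in `fetch` function (both names are illustrative, not part of the class):

    import time

    def throttled_map(items, fetch, rate_limit=5):
        # Apply fetch() to each item, starting at most rate_limit calls per second.
        results = []
        for item in items:
            results.append(fetch(item))
            time.sleep(1 / rate_limit)  # pause between consecutive calls
        return results

    print(throttled_map([1, 2, 3], lambda x: x * x))  # [1, 4, 9]

Because the pause is added after each call completes, the effective rate sits slightly below the nominal limit once request latency is counted; that conservatism is usually fine for public APIs.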

compound_gene_cooccurrence(gene_data, rate_limit=5)

Analyzes compound-gene co-occurrence relationships from the specified gene data file and saves the results into structured CSV files.

Source code in chemgraphbuilder/relationship_properties_extractor.py
def compound_gene_cooccurrence(self, gene_data, rate_limit=5):
    """
    Analyzes compound-gene co-occurrence relationships from the specified gene data file and saves the results into structured CSV files.
    """
    logging.info("Starting compound-gene co-occurrence analysis...")
    start_time = timeit.default_timer()

    try:
        df = pd.read_csv(gene_data)
        logging.info(f"Loaded data from {gene_data}. Total rows: {len(df)}")
    except FileNotFoundError:
        logging.error(f"File not found: {gene_data}")
        return "File not found."
    except pd.errors.EmptyDataError:
        logging.error(f"Empty data file: {gene_data}")
        return "Empty data file."
    except Exception as e:
        logging.error(f"Error reading {gene_data}: {e}")
        return "Error reading data file."

    gene_symbols = df['GeneSymbol'].unique().tolist()
    logging.info(f"Unique Gene Symbols to process: {len(gene_symbols)}")

    for gene_symbol in gene_symbols:
        logging.info(f"Processing Gene Symbol {gene_symbol}")
        try:
            data = self._fetch_chemical_gene_data(gene_symbol)
            filename = f"Data/Relationships/Cpd_Gene_CoOccurrence/Cpd_Gene_CoOccurrence_{gene_symbol}.csv"
            self._write_data_to_csv(data, filename)
            logging.info(f"Successfully wrote data for Gene Symbol {gene_symbol} to {filename}")
        except Exception as e:
            logging.error(f"Error processing Gene Symbol {gene_symbol}: {e}")
        time.sleep(1 / rate_limit)  # Ensuring we don't exceed rate limit

    elapsed = timeit.default_timer() - start_time
    logging.info(f"Compound-gene data fetching and saving completed in {elapsed:.2f} seconds.")
    return "Compound-gene data fetching and saving completed."

compound_gene_interaction(gene_data, rate_limit=5)

Analyzes compound-gene interaction relationships from the specified gene data file and saves the results into structured CSV files.

Source code in chemgraphbuilder/relationship_properties_extractor.py
def compound_gene_interaction(self, gene_data, rate_limit=5):
    """
    Analyzes compound-gene interaction relationships from the specified gene
    data file and saves the results into structured CSV files.
    """
    logging.info("Starting compound-gene interaction analysis...")
    start_time = timeit.default_timer()

    try:
        df = pd.read_csv(gene_data)
        logging.info(f"Loaded data from {gene_data}. Total rows: {len(df)}")
    except FileNotFoundError:
        logging.error(f"File not found: {gene_data}")
        return "File not found."
    except pd.errors.EmptyDataError:
        logging.error(f"Empty data file: {gene_data}")
        return "Empty data file."
    except Exception as e:
        logging.error(f"Error reading {gene_data}: {e}")
        return "Error reading data file."

    gene_symbols = df['GeneID'].unique().tolist()
    logging.info(f"Unique GeneIDs to process: {len(gene_symbols)}")

    for gene_symbol in gene_symbols:
        logging.info(f"Processing GeneID {int(gene_symbol)}")
        try:
            data = self._fetch_chemical_gene_interaction_data(int(gene_symbol))
            filename = f"Data/Relationships/Compound_Gene_Relationship/Compound_Gene_Interaction_Outside_PubChem_{int(gene_symbol)}.csv"
            df = pd.DataFrame(data)
            if not df.empty:
                # Reorder columns so 'cid' and 'geneid' come first
                all_columns = [col for col in df.columns if col not in ('cid', 'geneid')]
                all_columns = ['cid', 'geneid'] + all_columns
                df = df[all_columns]
                df.to_csv(filename, index=False)
            logging.info(f"Successfully wrote data for GeneID {int(gene_symbol)} to {filename}")
        except Exception as e:
            logging.error(f"Error processing GeneID {int(gene_symbol)}: {e}")
        time.sleep(1 / rate_limit)  # Ensuring we don't exceed rate limit

    elapsed = timeit.default_timer() - start_time
    logging.info(f"Compound-gene data fetching and saving completed in {elapsed:.2f} seconds.")
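
The column shuffle before writing is a small pandas idiom: build the desired ordering as a list, then index the frame with it. A self-contained illustration with made-up columns:

    import pandas as pd

    df = pd.DataFrame({'score': [0.9], 'geneid': [1576], 'cid': [2244]})

    # Move 'cid' and 'geneid' to the front; everything else keeps its order.
    rest = [col for col in df.columns if col not in ('cid', 'geneid')]
    df = df[['cid', 'geneid'] + rest]
    print(df.columns.tolist())  # ['cid', 'geneid', 'score']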

compound_gene_relationship(main_data)

Identifies and records relationships between compounds and proteins from the input data.

This method focuses on extracting compound-protein interaction data, including activity outcomes and values. It selects pertinent columns, removes duplicate records, and sorts the data by Compound ID and Target Accession for clarity. The cleaned dataset is then saved to a CSV file, providing a structured view of how compounds interact with various proteins, which can be critical for drug discovery and pharmacological research.

Parameters:

    main_data (str): Path to the CSV file with compound and protein data. This file should contain columns for 'CID' (Compound ID), 'Target Accession', 'Activity Outcome', 'Activity Name', and 'Activity Value [uM]'. Required.

Returns:

    pandas.DataFrame: A DataFrame with processed compound-protein relationships, sorted and cleaned for direct analysis or database insertion.

Side Effects
  • Saves the processed relationships data to 'Data/Relationships/Compound_Gene_Relationship.csv', facilitating easy access and integration.
Source code in chemgraphbuilder/relationship_properties_extractor.py
def compound_gene_relationship(self, main_data):
    """
    Identifies and records relationships between compounds and proteins from
    the input data.

    This method focuses on extracting compound-protein interaction data,
    including activity outcomes and values. It selects
    pertinent columns, removes duplicate records, and sorts the data by
    Compound ID and Target Accession for clarity. The cleaned dataset is
    then saved to a CSV file, providing a structured view of how compounds
    interact with various proteins, which can be critical for drug discovery
    and pharmacological research.

    Parameters:
        main_data (str): Path to the CSV file with compound and protein data.
        This file should contain columns for 'CID' (Compound ID),
        'Target Accession', 'Activity Outcome', 'Activity Name', and
        'Activity Value [uM]'.

    Returns:
        pandas.DataFrame: A DataFrame with processed compound-protein
        relationships, sorted and cleaned for direct analysis or database
        insertion.

    Side Effects:
        - Saves the processed relationships data to
        'Data/Relationships/Compound_Gene_Relationship.csv',
        facilitating easy access and integration.
    """
    df = pd.read_csv(main_data)
    columns_to_select = ['CID', 'Target GeneID', 'Target Accession',
                         'Activity Outcome', 'Activity Name',
                         'Activity Value [uM]']
    df = df[columns_to_select]
    df = df.drop_duplicates(keep='first', ignore_index=True)
    df = df.sort_values(['CID', 'Target Accession'])
    df.dropna(axis=0, thresh=1, inplace=True)  # keep rows with at least one non-NA value
    df.to_csv(f'Data/Relationships/Compound_Gene_Relationship.csv', index=False)
    return df
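
A toy illustration of the cleanup steps (deduplicate, then sort by CID and accession); the values are invented for the example:

    import pandas as pd

    df = pd.DataFrame({
        'CID': [3672, 2244, 2244],
        'Target Accession': ['P10632', 'P08684', 'P08684'],
        'Activity Outcome': ['Inactive', 'Active', 'Active'],
    })
    df = df.drop_duplicates(keep='first', ignore_index=True)
    df = df.sort_values(['CID', 'Target Accession'])
    print(df)  # two unique rows remain, ordered by CID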

compound_similarity_relationship(main_data, start_chunk=0)

Identifies and records the similarity relationships between compounds based on a list of CIDs. The similarity is determined by the Tanimoto similarity coefficient with a 95% threshold to ensure high structural similarity.

This method reads a CSV file containing compound data, filters compounds based on specific 'Target GeneID' values, and fetches similar CIDs for each compound. The compounds are processed in chunks to manage memory usage and improve efficiency. The results are saved into separate CSV files for each chunk.

Parameters:

    main_data (str): Path to the CSV file containing the main compound data. Required.
    start_chunk (int): The starting index for processing chunks. Default: 0.

Note
  • The method filters the main data for compounds associated with specific 'Target GeneID' values before fetching similar CIDs, optimizing the process for relevant compounds only.
  • The division of CIDs into chunks and concurrent processing helps in managing large datasets and utilizes parallelism for faster execution.

Source code in chemgraphbuilder/relationship_properties_extractor.py
def compound_similarity_relationship(self, main_data, start_chunk=0):
    """
    Identifies and records the similarity relationships between compounds
    based on a list of CIDs. The similarity is determined by the Tanimoto
    similarity coefficient with a 95% threshold to ensure high structural
    similarity.

    This method reads a CSV file containing compound data, filters compounds
    based on specific 'Target GeneID' values,
    and fetches similar CIDs for each compound. The compounds are processed
    in chunks to manage memory usage and improve efficiency. The results are
    saved into separate CSV files for each chunk.

    Parameters:
        main_data (str): Path to the CSV file containing the main compound data.
        start_chunk (int): The starting index for processing chunks.
    Note:
        - The method filters the main data for compounds associated with
        specific 'Target GeneID' values before fetching similar CIDs,
        optimizing the process for relevant compounds only.
        - The division of CIDs into chunks and concurrent processing helps
        in managing large datasets and utilizes parallelism for faster
        execution.
    """
    df = pd.read_csv(main_data)
    df = df[df['Target GeneID'].isin([1576, 1544, 1557, 1559, 1565])]
    df = df.dropna(subset=['CID'])
    IDs = df['CID'].unique().tolist()

    chunk_size=10000
    chunks = [IDs[i:i + chunk_size] for i in range(0, len(IDs), chunk_size)]

    for i, chunk in enumerate(chunks, start=0):
        if i >= start_chunk:
            chunk_results = self.process_chunk(chunk)
            chunk_df = pd.DataFrame(chunk_results, columns=['CID', 'Similar CIDs'])
            if not os.path.exists('Data/Relationships/Compound_Similarities'):
                os.makedirs('Data/Relationships/Compound_Similarities')
            chunk_df.to_csv(f'Data/Relationships/Compound_Similarities/Chunk_{i}.csv', index=False)
            logging.info(f"Processed chunk {i} for compound similarity relationships.")
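
The chunking is plain list slicing, and `start_chunk` resumes a run by skipping earlier chunks. The same arithmetic with a small chunk size:

    ids = list(range(25))
    chunk_size = 10
    chunks = [ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)]
    print(len(chunks))  # 3 chunks: 10 + 10 + 5 IDs

    start_chunk = 1
    for i, chunk in enumerate(chunks):
        if i >= start_chunk:  # chunk 0 was already processed in a previous run
            print(i, len(chunk))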

compound_transformation(gene_properties)

Analyzes compound transformation data based on gene properties, focusing on metabolic transformations involving specified genes. This method queries the PubChem database for transformation data related to compounds associated with the genes identified in the provided CSV file.

Parameters:

    gene_properties (str): Path to the CSV file containing gene properties generated by the NodePropertiesExtractor class, which should include 'Target GeneID' as one of its columns. Required.
Processing Steps
  1. Reads the provided CSV file to extract unique gene identifiers.
  2. For each gene identifier, constructs a query to fetch relevant compound transformation data from PubChem, focusing on metabolic transformations where the gene plays a role.
  3. Processes and aggregates the fetched data into a structured pandas DataFrame.
  4. Filters the aggregated data to retain specific columns relevant to compound transformations, including substrate and metabolite Compound IDs (CIDs), the type of metabolic conversion, gene identifiers, PubMed IDs, and DOIs for related publications.
  5. Saves the aggregated and filtered DataFrame to a CSV file for further analysis or integration into knowledge graphs or other data models.

Returns:

    pandas.DataFrame: A DataFrame containing processed compound transformation data, including substrate and metabolite CIDs, metabolic conversion types, gene identifiers, PubMed IDs, and DOIs. The DataFrame structure facilitates further analysis or use in constructing detailed views of metabolic pathways involving the specified genes.

Side Effects
  • Saves the aggregated compound transformation data to 'Data/Relationships/Compound_Transformation.csv' in the current working directory. This file captures the relationship between substrates, metabolites, and genes based on the input gene properties.

Raises:

    FileNotFoundError: If the specified 'gene_properties' file does not exist or cannot be read.
    ValueError: If 'gene_properties' does not contain the required 'Target GeneID' column.

Example

    >>> extractor = RelationshipPropertiesExtractor()
    >>> transformation_df = extractor.compound_transformation('Data/Nodes/gene_properties.csv')
    >>> print(transformation_df.head())

This example processes gene properties from 'Data/Nodes/gene_properties.csv', queries PubChem for compound transformation data related to the genes, and compiles the results into a DataFrame.

Note

The method assumes that the input 'gene_properties' file is accessible and correctly formatted. The availability and structure of the PubChem database may affect the completeness and accuracy of the fetched transformation data. Users should verify the existence of the 'Data/Relationships' directory and have appropriate permissions to write files to it.

Source code in chemgraphbuilder/relationship_properties_extractor.py
def compound_transformation(self, gene_properties):
    """
    Analyzes compound transformation data based on gene properties, focusing
    on metabolic transformations involving specified genes. This method
    queries the PubChem database for transformation data related
    to compounds associated with the genes identified in the provided CSV file.

    Parameters:
        gene_properties (str): Path to the CSV file containing gene properties
        generated by the NodePropertiesExtractor class, which should include
        'Target GeneID' as one of its columns. This file is used to identify genes
        of interest for which compound transformation data will be fetched.

    Processing Steps:
        1. Reads the provided CSV file to extract unique gene identifiers.
        2. For each gene identifier, constructs a query to fetch relevant
        compound transformation data from PubChem, focusing on metabolic
        transformations where the gene plays a role.
        3. Processes and aggregates the fetched data into a structured
        pandas DataFrame.
        4. Filters the aggregated data to retain specific columns relevant
        to compound transformations, including substrate and metabolite
        Compound IDs (CIDs), the type of metabolic conversion, gene
        identifiers, PubMed IDs, and DOIs for related publications.
        5. Saves the aggregated and filtered DataFrame to a CSV file for
        further analysis or integration into knowledge graphs or other
        data models.

    Returns:
        pandas.DataFrame: A DataFrame containing processed compound
        transformation data, including substrate and metabolite CIDs,
        metabolic conversion types, gene identifiers, PubMed IDs, and DOIs.
        The DataFrame structure facilitates further analysis or use in
        constructing detailed views of metabolic pathways involving the
        specified genes.

    Side Effects:
        - Saves the aggregated compound transformation data to
        'Data/Relationships/Compound_Transformation.csv'
        in the current working directory. This file captures the relationship
        between substrates, metabolites, and genes based on the input gene
        properties.

    Raises:
        FileNotFoundError: If the specified 'gene_properties' file does not
        exist or cannot be read.
        ValueError: If 'gene_properties' does not contain the required
        'Target GeneID' column.

    Example:
        >>> extractor = RelationshipPropertiesExtractor()
        >>> transformation_df = extractor.compound_transformation('Data/Nodes/gene_properties.csv')
        >>> print(transformation_df.head())
        This example processes gene properties from
        'Data/Nodes/gene_properties.csv', queries PubChem for
        compound transformation data related to the genes,
        and compiles the results into a DataFrame.

    Note:
        The method assumes that the input 'gene_properties' file is
        accessible and correctly formatted.
        The availability and structure of the PubChem database may affect
        the completeness and accuracy of the fetched transformation data.
        Users should verify the existence of the 'Data/Relationships'
        directory and have appropriate permissions to write files to it.
    """
    df = pd.read_csv(gene_properties)
    IDs = df['Target GeneID'].unique().tolist()

    transformation_dfs = []

    for gid in IDs:
        if not np.isnan(gid):
            gid = int(gid)
            base_url = 'https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=csv&query='
            query = {
                "download": "*",
                "collection": "chemblmetabolism",
                "order": ["relevancescore,desc"],
                "start": 1,
                "limit": 10000000,
                "downloadfilename": f"pubchem_geneid_{gid}_chemblmetabolism",
                "where": {
                    "ands": [{"geneid": gid}]
                }
            }

            # Convert the dictionary to a JSON string
            query_string = json.dumps(query)
            # URL encode the JSON string
            encoded_query = quote(query_string)
            # Construct the final URL
            url = f"{base_url}{encoded_query}"

            response = self._send_request(url)
            if response:
                try:
                    # Read the CSV data
                    transformation_df = pd.read_csv(StringIO(response.text), sep=',', header=0, low_memory=False)

                    # Keep only the columns relevant to compound transformations
                    transformation_df = transformation_df[['substratecid',
                                                           'metabolitecid',
                                                           'metconversion',
                                                           'geneids',
                                                           'pmids',
                                                           'dois']]

                    # Append the DataFrame to the list
                    transformation_dfs.append(transformation_df)
                except pd.errors.ParserError as e:
                    logging.error(f"Error parsing CSV for gene ID {gid}: {e}\nurl:{url}")
                    continue  # Skip this gene ID and continue with others

    # Concatenate all DataFrames
    if transformation_dfs:
        transformation_df = pd.concat(transformation_dfs, ignore_index=True)
    else:
        transformation_df = pd.DataFrame(columns=['substratecid', 'metabolitecid', 'metconversion', 'geneids', 'pmids', 'dois'])

    self._write_to_csv(transformation_df, 'Data/Relationships/Compound_Transformation.csv')

    return transformation_df
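
The query construction deserves a standalone look: PubChem's SDQ agent accepts a JSON query document passed as a single URL-encoded GET parameter. A minimal sketch for one gene (1576 is just an example GeneID):

    import json
    from urllib.parse import quote

    gid = 1576  # example GeneID
    base_url = ('https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi'
                '?infmt=json&outfmt=csv&query=')
    query = {
        "download": "*",
        "collection": "chemblmetabolism",      # ChEMBL metabolism records
        "order": ["relevancescore,desc"],
        "start": 1,
        "limit": 10000000,
        "downloadfilename": f"pubchem_geneid_{gid}_chemblmetabolism",
        "where": {"ands": [{"geneid": gid}]},  # restrict results to this gene
    }
    url = base_url + quote(json.dumps(query))  # the JSON becomes one encoded parameter
    print(url[:100] + '...')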

fetch_data_for_aid(aid, columns_to_remove)

Fetches and processes assay data for a specified Assay ID (AID) from the PubChem database, preparing it for analysis or further processing.

This method queries the PubChem database for assay data associated with a given AID. It constructs the query URL, sends the request using a previously established session, and processes the response. The response is expected to be in CSV format, which this method reads into a pandas DataFrame. Specific columns can be removed from this DataFrame based on the requirements for analysis. This allows for the customization of the fetched data, making it easier to work with specific datasets.

If the request is successful and the data is fetched without issues, it undergoes initial processing to remove unwanted columns as specified by the 'columns_to_remove' parameter. In case of an error during the data fetching or processing (e.g., issues with parsing the CSV data), appropriate error messages are logged, and an empty DataFrame is returned as a fallback.

Parameters:

    aid (int): The assay ID for which data is to be fetched. This ID is used to construct the query URL to the PubChem database. Required.
    columns_to_remove (list of str): A list of column names that should be removed from the fetched DataFrame. This allows for the exclusion of data that might not be relevant to the subsequent analysis or processing steps. Required.

Returns:

    pandas.DataFrame: A DataFrame containing the processed data associated with the given AID. The DataFrame will exclude columns listed in 'columns_to_remove'. If the data fetching fails or if an error occurs during processing, an empty DataFrame is returned.

Raises:

    requests.RequestException: If an error occurs during the HTTP request to the PubChem API.
    pd.errors.ParserError: If an error occurs while parsing the CSV response from PubChem into a DataFrame.

Example

    >>> extractor = RelationshipPropertiesExtractor()
    >>> processed_data_df = extractor.fetch_data_for_aid(12345, ['UnwantedColumn1', 'UnwantedColumn2'])
    >>> print(processed_data_df.head())

This example demonstrates how to fetch and process assay data for the assay with ID 12345, removing 'UnwantedColumn1' and 'UnwantedColumn2' from the resulting DataFrame. The first few rows of the processed DataFrame are printed as an output.

Note
  • This method is part of a class that requires a valid session with the PubChem API. Ensure that the class is properly initialized and that the session is active.
  • The removal of columns is an optional step and can be customized based on the analysis needs. If no columns need to be removed, pass an empty list as 'columns_to_remove'.
Source code in chemgraphbuilder/relationship_properties_extractor.py
def fetch_data_for_aid(self, aid, columns_to_remove):
    """
    Fetches and processes assay data for a specified Assay ID (AID) from the
    PubChem database, preparing it for analysis or further processing.

    This method queries the PubChem database for assay data associated with
    a given AID. It constructs the query URL, sends the request using a
    previously established session, and processes the response. The response
    is expected to be in CSV format, which this method reads into a pandas
    DataFrame. Specific columns can be removed from this DataFrame based on
    the requirements for analysis. This allows for the customization of
    the fetched data, making it easier to work with specific datasets.

    If the request is successful and the data is fetched without issues,
    it undergoes initial processing to remove unwanted columns as specified
    by the 'columns_to_remove' parameter. In case of an error during the
    data fetching or processing (e.g., issues with parsing the CSV data),
    appropriate error messages are logged, and an empty DataFrame is
    returned as a fallback.

    Parameters:
        aid (int): The assay ID for which data is to be fetched. This ID is
        used to construct the query URL to the PubChem database.
        columns_to_remove (list of str): A list of column names that should
        be removed from the fetched DataFrame. This allows for the exclusion
        of data that might not be relevant to the subsequent analysis or
        processing steps.

    Returns:
        pandas.DataFrame: A DataFrame containing the processed data
        associated with the given AID. The DataFrame will exclude columns
        listed in 'columns_to_remove'. If the data fetching fails or if
        an error occurs during processing, an empty DataFrame is returned.

    Raises:
        requests.RequestException: If an error occurs during the HTTP request
        to the PubChem API. This includes scenarios such as timeout issues,
        non-200 status codes, or network-related errors. The exception is
        handled internally with logging, but it's important to be aware of
        its possibility.
        pd.errors.ParserError: If an error occurs while parsing the CSV
        response from PubChem into a DataFrame. This could happen due to
        malformed data or unexpected changes in the response format.
        Like with RequestException, this error is logged and results in
        the return of an empty DataFrame.

    Example:
        >>> extractor = RelationshipPropertiesExtractor()
        >>> processed_data_df = extractor.fetch_data_for_aid(12345, ['UnwantedColumn1', 'UnwantedColumn2'])
        >>> print(processed_data_df.head())
        This example demonstrates how to fetch and process assay data for
        the assay with ID 12345, removing 'UnwantedColumn1' and
        'UnwantedColumn2' from the resulting DataFrame. The first few rows
        of the processed DataFrame are printed as an output.

    Note:
        - This method is part of a class that requires a valid session with
        the PubChem API. Ensure that the class is properly initialized and that
        the session is active.
        - The removal of columns is an optional step and can be customized
        based on the analysis needs. If no columns need to be removed, pass an
        empty list as 'columns_to_remove'.
    """
    url = (
        "https://pubchem.ncbi.nlm.nih.gov/assay/pcget.cgi?"
        "query=download&record_type=datatable&actvty="
        f"all&response_type=display&aid={aid}"
    )

    response = self._send_request(url)
    if response and response.status_code == 200:
        try:
            compound_df = pd.read_csv(StringIO(response.text), sep=',')
            # Drop specified columns and process column names in-place for memory efficiency
            columns_to_remove_set = set(columns_to_remove)
            existing_columns_set = set(compound_df.columns)
            columns_to_actually_remove = list(columns_to_remove_set & existing_columns_set)
            compound_df.drop(columns=columns_to_actually_remove,
                             errors='ignore', inplace=True)
            compound_df.rename(columns=lambda x: x.replace('PUBCHEM_', '') if x.startswith('PUBCHEM_') else x, inplace=True)

            compound_df['AID'] = aid
            return compound_df
        except pd.errors.ParserError as e:
            logging.error(f"CSV parsing failed for AID {aid}: {e}")
    else:
        logging.error(f"Failed to fetch data for AID {aid}. Status code: {response.status_code if response else 'No Response'}")
    return pd.DataFrame()  # Return an empty DataFrame in case of failure

fetch_similar_cids(cid)

Fetches similar compound IDs (CIDs) from the PubChem database for a given compound ID (CID) using 2D similarity.

This method queries the PubChem database to find compounds that are similar to the given CID based on 2D structural similarity. The similarity threshold is set to 95%, and a maximum of 100 similar CIDs are fetched. The response is parsed from XML format to extract the similar CIDs.

Parameters:

    cid (int): The compound ID for which similar CIDs are to be fetched. Required.

Returns:

    tuple: A tuple containing the original CID and a list of similar CIDs. If an error occurs, the list of similar CIDs will be empty.

Raises:

    Exception: Logs an error message with the original CID and the exception if the request to PubChem fails or if parsing the XML response encounters an error.

Note
  • The method utilizes the requests library for HTTP requests and xml.etree.ElementTree for XML parsing.
  • In case of a request failure or parsing error, the method logs the error and returns the original CID with an empty list, allowing the calling function to handle the exception as needed.
Source code in chemgraphbuilder/relationship_properties_extractor.py
def fetch_similar_cids(self, cid):
    """
    Fetches similar compound IDs (CIDs) from the PubChem database for a
    given compound ID (CID) using 2D similarity.

    This method queries the PubChem database to find compounds that are
    similar to the given CID based on 2D structural similarity.
    The similarity threshold is set to 95%, and a maximum of 100 similar
    CIDs are fetched. The response is parsed from XML format to extract
    the similar CIDs.

    Parameters:
        cid (int): The compound ID for which similar CIDs are to be fetched.

    Returns:
        tuple: A tuple containing the original CID and a list of similar
        CIDs. If an error occurs, the list of similar CIDs will be empty.

    Raises:
        Exception: Logs an error message with the original CID and the
        exception if the request to PubChem fails or if parsing the XML
        response encounters an error.

    Note:
        - The method utilizes the `requests` library for HTTP requests and
        `xml.etree.ElementTree` for XML parsing.
        - In case of a request failure or parsing error, the method logs
        the error and returns the original CID with an empty list,
        allowing the calling function to handle the exception as needed.
    """
    url = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/"
           f"fastsimilarity_2d/cid/{int(cid)}/cids/XML?Threshold=95&MaxRecords=100")
    try:
        response = requests.get(url)
        response.raise_for_status()
        xml_data = response.text

        # Parse XML data
        tree = ET.parse(io.StringIO(xml_data))
        root = tree.getroot()

        # Extracting CID values
        similar_cids = [element.text for element in root.findall('{http://pubchem.ncbi.nlm.nih.gov/pug_rest}CID')]
        return cid, similar_cids
    except Exception as e:
        logging.error(f"Error processing CID {cid}: {e}")
        return cid, []
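
A hedged usage sketch (CID 2244 is aspirin; the result depends on the live PubChem service):

    extractor = RelationshipPropertiesExtractor()
    cid, similar = extractor.fetch_similar_cids(2244)
    print(f"{len(similar)} CIDs at >=95% 2D similarity to CID {cid}")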

gene_protein_relationship(main_data)

Extracts and saves relationships between genes and proteins based on the provided dataset.

This method selects relevant columns to highlight the relationships between genes and their corresponding proteins. It removes duplicate entries to ensure that each relationship is represented uniquely and saves the resultant data to a CSV file. This facilitates easy integration of genetic data into knowledge bases or further analysis.

Parameters:

    main_data (str): Path to the CSV file containing gene and protein data. Expected columns include 'Target GeneID' and 'Target Accession'. Required.

Returns:

    pandas.DataFrame: A DataFrame of unique gene-protein relationships, including gene ID and protein accession numbers.

Side Effects
  • Writes the processed data to 'Data/Relationships/Gene_Protein_Relationship.csv' in a structured CSV format.
Source code in chemgraphbuilder/relationship_properties_extractor.py
def gene_protein_relationship(self, main_data):
    """
    Extracts and saves relationships between genes and proteins based on
    the provided dataset.

    This method selects relevant columns to highlight the relationships
    between genes and their corresponding proteins.
    It removes duplicate entries to ensure that each relationship is
    represented uniquely and saves the resultant data to
    a CSV file. This facilitates easy integration of genetic data into
    knowledge bases or further analysis.

    Parameters:
        main_data (str): Path to the CSV file containing gene and protein data.
        Expected columns include 'Target GeneID' and 'Target Accession'.

    Returns:
        pandas.DataFrame: A DataFrame of unique gene-protein relationships,
        including gene ID and protein accession numbers.

    Side Effects:
        - Writes the processed data to 'Data/Relationships/Gene_Protein_Relationship.csv'
        in a structured CSV format.
    """
    df = pd.read_csv(main_data)
    columns_to_select = ['Target GeneID', 'Target Accession']
    df = df[columns_to_select]
    df = df.drop_duplicates(keep='first', ignore_index=True)
    df.to_csv(f'Data/Relationships/Gene_Protein_Relationship.csv', index=False)
    return df

process_chunk(chunk)

Processes a chunk of CIDs in parallel to fetch similar CIDs for each CID in the chunk.

This method uses a ThreadPoolExecutor to send out concurrent requests for fetching similar CIDs for a list of CIDs. The number of worker threads is set to 5. Each CID's request is handled by fetch_similar_cids method.

Parameters:

    chunk (list of int): A list of compound IDs (CIDs) to process in parallel. Required.

Returns:

    list of tuples: A list of tuples, each containing a CID and its corresponding list of similar CIDs.

Side Effects
  • Utilizes concurrent threads to speed up the fetching process.
  • May log errors if any occur during the fetching of similar CIDs for individual CIDs.
Source code in chemgraphbuilder/relationship_properties_extractor.py
def process_chunk(self, chunk):
    """
    Processes a chunk of CIDs in parallel to fetch similar CIDs for each CID
    in the chunk.

    This method uses a ThreadPoolExecutor to send out concurrent requests for
    fetching similar CIDs for a list of CIDs.
    The number of worker threads is set to 5. Each CID's request is handled
    by `fetch_similar_cids` method.

    Parameters:
        chunk (list of int): A list of compound IDs (CIDs) to process in
        parallel.

    Returns:
        list of tuples: A list of tuples, each containing a CID and its
        corresponding list of similar CIDs.

    Side Effects:
        - Utilizes concurrent threads to speed up the fetching process.
        - May log errors if any occur during the fetching of similar CIDs
        for individual CIDs.
    """
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(self.fetch_similar_cids, cid) for cid in chunk]
        results = [future.result() for future in as_completed(futures)]
    return results
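
A short usage sketch tying `process_chunk` to `fetch_similar_cids` (the CIDs are arbitrary examples):

    extractor = RelationshipPropertiesExtractor()
    for cid, similar in extractor.process_chunk([2244, 3672, 1983]):
        print(cid, len(similar))

Note that `as_completed` yields futures in completion order, so the tuples may come back in a different order than the input CIDs.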

7. Relationship Data Processor

RelationshipDataProcessor

A class to process relationship data files, filtering and augmenting the data.

Attributes:

    path (str): The directory path where the data files are stored.
    csv_files (list): List of CSV files matching the pattern 'AID_*.csv'.
    all_data_connected (dict): A dictionary containing additional data connected to assays.
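
A hedged usage sketch; the path is the default location where `assay_compound_relationship` writes its per-assay files:

    processor = RelationshipDataProcessor('Data/Relationships/Assay_Compound_Relationship')
    processor.process_files()  # filters, cleans, and augments every AID_*.csv in the folder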

Source code in chemgraphbuilder/relationship_data_processor.py
class RelationshipDataProcessor:
    """
    A class to process relationship data files, filtering and augmenting the data.

    Attributes:
        path (str): The directory path where the data files are stored.
        csv_files (list): List of CSV files matching the pattern 'AID_*.csv'.
        all_data_connected (dict): A dictionary containing additional data connected to assays.
    """

    def __init__(self, path, start_chunk=0):
        """
        Initializes the RelationshipDataProcessor with the specified path and start chunk index.

        Args:
            path (str): The directory path containing the CSV files.
            start_chunk (int): The starting index for processing chunks.
        """
        self.path = path
        self.csv_files = glob.glob(os.path.join(path, "AID_*.csv"))
        self.start_chunk = start_chunk
        self.all_data_connected = {}
        self.unique_column_names = []

        # Check if the all_data_connected_dict and all_columns files exist
        all_data_connected_file = 'Data/Relationships/all_data_connected_dict.txt'
        all_columns_file = 'Data/Relationships/all_columns.txt'

        if os.path.exists(all_data_connected_file):
            self.all_data_connected = self._load_all_data_connected_from_file(all_data_connected_file)
        else:
            self.all_data_connected = self._load_all_data_connected('Data/AllDataConnected.csv')

        if os.path.exists(all_columns_file):
            self.unique_column_names = self._load_columns_from_file(all_columns_file)
        else:
            self.unique_column_names = self._get_filtered_columns()
            self._save_columns_to_file(all_columns_file, self.unique_column_names)

        # Ensure the 'activity' column is included
        if 'activity' not in self.unique_column_names:
            self.unique_column_names.append('activity')


    def _load_all_data_connected(self, file_path):
        """
        Loads additional data from a specified file and organizes it into a dictionary.

        Args:
            file_path (str): The path to the file containing additional data.

        Returns:
            dict: A dictionary with keys as tuples of (aid, cid, activity_outcome)
                  and values as dictionaries of additional information.
        """
        all_data_connected = {}
        df = pd.read_csv(file_path)
        df.columns = [col.replace(' ', '_').lower() for col in df.columns]
        df = df.dropna(subset=['aid', 'cid'], how='any')
        for _, row in df.iterrows():
            key = (int(row['aid']), int(row['cid']), row['activity_outcome'])
            all_data_connected[key] = row.to_dict()

        # Optionally save the dictionary to a file
        self._save_all_data_connected_to_file(all_data_connected)

        return all_data_connected


    def _save_all_data_connected_to_file(self, all_data_connected):
        """
        Saves the all_data_connected dictionary to a file.

        Args:
            all_data_connected (dict): The dictionary to save.
        """
        with open("Data/Relationships/all_data_connected_dict.txt", "w") as file:
            for key, value in all_data_connected.items():
                key_str = f"({key[0]},{key[1]},'{key[2]}')"
                # Replace nan values with a placeholder string
                for k, v in value.items():
                    if pd.isna(v):
                        value[k] = "__nan__"
                file.write(f"{key_str}: {json.dumps(value)}\n")


    def _load_all_data_connected_from_file(self, file_path):
        """
        Loads the all_data_connected dictionary from a file.

        Args:
            file_path (str): The path to the file containing the dictionary.

        Returns:
            dict: The loaded dictionary.
        """
        all_data_connected = {}
        with open(file_path, "r") as file:
            for line in file:
                line = line.strip()
                if not line or ": " not in line:
                    logging.warning(f"Skipping improperly formatted or empty line: {line}")
                    continue
                try:
                    key_str, value_str = line.split(": ", 1)
                    # Ensure correct parsing of key_str
                    key_str = key_str.strip("()").split(",")
                    key = (int(key_str[0]), int(key_str[1]), key_str[2].strip("'"))
                    value_dict = json.loads(value_str)
                    # Convert '__nan__' back to np.nan
                    for k, v in value_dict.items():
                        if v == "__nan__":
                            value_dict[k] = np.nan
                    all_data_connected[key] = value_dict
                except (json.JSONDecodeError, ValueError) as e:
                    logging.error(f"Error decoding JSON from line: {line}\n{e}")
        return all_data_connected


    def _get_filtered_columns(self):
        """
        Extracts unique column names from the CSV files and additional data.

        Returns:
            list: A list of unique column names.
        """
        all_columns = set()

        # Extract additional columns from the all_data_connected dictionary
        additional_columns = set()
        for value in self.all_data_connected.values():
            additional_columns.update(value.keys())

        def read_columns(file):
            try:
                # Read only column names from the CSV file
                df = pd.read_csv(file, nrows=0)
                return set([col.replace(' ', '_').lower() for col in df.columns])
            except Exception as e:
                logging.error(f"Error reading {file}: {e}")
                return set()

        # Use ThreadPoolExecutor for concurrent reading of columns from multiple files
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(read_columns, self.csv_files))

        for columns in results:
            all_columns.update(columns)

        all_columns.update(additional_columns)
        all_columns = list(all_columns)

        # Reorder Columns
        all_columns = [col for col in all_columns if col not in ('aid', 'cid')]
        all_columns = ['aid', 'cid'] + all_columns

        return all_columns


    def _save_columns_to_file(self, file_path, columns):
        """
        Saves the list of columns to a file.

        Args:
            file_path (str): The path to the file.
            columns (list): The list of columns to save.
        """
        with open(file_path, "w") as file:
            for item in columns:
                file.write(f"{item}\n")


    def _load_columns_from_file(self, file_path):
        """
        Loads the list of columns from a file.

        Args:
            file_path (str): The path to the file.

        Returns:
            list: The loaded list of columns.
        """
        with open(file_path, "r") as file:
            columns = [line.strip() for line in file]
        return columns


    def _add_all_data_connected_info(self, row):
        """
        Adds additional information from all_data_connected to a row.

        Args:
            row (pd.Series): A row from a DataFrame.

        Returns:
            pd.Series: The updated row with additional data if available.
        """
        key = (int(row['aid']), int(row['cid']), row['activity_outcome'])
        if key in self.all_data_connected:
            additional_info = self.all_data_connected[key]
            for col, val in additional_info.items():
                row[col] = val
        else:
            logging.warning(f"Key {key} not found in all_data_connected.")
        return row


    def process_files(self):
        """
        Processes the CSV files by filtering, cleaning, and augmenting data.

        The processed data is saved to output files.
        """
        self._filter_and_clean_data()
        logging.info("Data filtered, cleaned, and combined successfully.")


    def _filter_and_clean_data(self):
        """
        Filters and cleans data from CSV files, then saves to output files in chunks.
        """
        base_output_file = 'Data/Relationships/Assay_Compound_Relationship_Processed/Assay_Compound_Relationship'
        base_compound_gene_file = 'Data/Relationships/Compound_Gene_Relationship/Compound_Gene_Relationship'

        # Process files in batches of 100 
        batch_size = 100
        total_files = len(self.csv_files)

        for batch_index in range(0, total_files, batch_size):
            if batch_index >= self.start_chunk * batch_size:
                batch_files = self.csv_files[batch_index:batch_index + batch_size]
                batch_output_file = f"{base_output_file}_batch_{batch_index // batch_size + 1}.csv"
                batch_compound_gene_file = f"{base_compound_gene_file}_batch_{batch_index // batch_size + 1}.csv"

                # Initialize output files with headers for each batch
                pd.DataFrame(columns=self.unique_column_names).to_csv(batch_output_file, index=False)
                pd.DataFrame(columns=['cid', 'target_geneid', 'activity', 'aid']).to_csv(batch_compound_gene_file, index=False)

                for file in batch_files:
                    logging.info(f"Processing file {file}")
                    self._process_file(file, self.unique_column_names, batch_output_file, batch_compound_gene_file)

                logging.info(f"Processed batch {batch_index // batch_size + 1} of {total_files // batch_size + 1}")


    def _process_file(self, file, unique_column_names, output_file, compound_gene_file):
        """
        Processes a single CSV file, applying filtering, cleaning, and adding data.

        Args:
            file (str): The file path to the CSV file.
            unique_column_names (list): The list of unique column names to use.
            output_file (str): The path to the output file for combined data.
            compound_gene_file (str): The path to the output file for compound-gene relationships.
        """
        try:
            df = pd.read_csv(file, dtype={'ASSAYDATA_COMMENT': 'object'})
            df.columns = [col.replace(' ', '_').lower() for col in df.columns]
            df = df.dropna(subset=['cid'], how='any')

            phenotype_cols = [col for col in df.columns if col.startswith('phenotype')]

            if isinstance(df, pd.Series):
                df = df.to_frame().T  # Convert to DataFrame if a Series is encountered
            if df.columns.duplicated().any():
                df = df.loc[:, ~df.columns.duplicated()]
                logging.info("Duplicated columns removed from partition.")
            df = df.reindex(columns=unique_column_names, fill_value=pd.NA)
            df = df.dropna(subset=['aid', 'cid'], how='any')

            if not df.empty:
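                # Row-wise mode across the phenotype columns yields a consensus value per record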
                df['measured_activity'] = df[phenotype_cols].apply(lambda row: row.mode()[0] if not row.mode().empty else None, axis=1)

                df = df.apply(self._add_all_data_connected_info, axis=1)

                if any(col in df.columns for col in phenotype_cols) and df['activity_outcome'].notna().all():
                    df = df.groupby(['activity_outcome', 'assay_name']).apply(self.propagate_phenotype).reset_index(drop=True)

                if 'target_geneid' not in df.columns:
                    df['target_geneid'] = pd.NA

                if 'sid' in df.columns:
                    df['activity_url'] = df.apply(lambda row: f"https://pubchem.ncbi.nlm.nih.gov/bioassay/{row['aid']}#sid={row['sid']}", axis=1)
                else:
                    df['activity_url'] = pd.NA

                # Drop rows where both aid and cid are 1
                df = df[(df['aid'] != 1) | (df['cid'] != 1)]

                df = self._determine_labels_and_activity(df)

                logging.info(f"Processed file {file} with {len(df)} rows.")
                if not df.empty:
                    # Write the processed data to the output files
                    df.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)
                    df[['cid', 'target_geneid', 'activity', 'aid']].to_csv(compound_gene_file, mode='a', header=not os.path.exists(compound_gene_file), index=False)
            else:
                logging.info(f"No data to process in file {file} after filtering.")
        except Exception as e:
            logging.error(f"Error processing file {file}: {e}")

    @staticmethod
    def most_frequent(row):
        """
        Finds the most frequent value in a row, excluding NaN values.

        Args:
            row (pd.Series): A row from a DataFrame.

        Returns:
            str: The most frequent value in the row.
        """
        values = row.dropna()
        string_values = values[values.apply(lambda x: isinstance(x, str))]
        return string_values.mode()[0] if not string_values.empty else None

    @staticmethod
    def propagate_phenotype(group):
        """
        Propagates the phenotype information within a group.

        Args:
            group (pd.DataFrame): A DataFrame group.

        Returns:
            pd.DataFrame: The updated group with propagated phenotype information.
        """
        phenotype_value = group['phenotype'].dropna().unique()
        if len(phenotype_value) > 0:
            group['phenotype'] = phenotype_value[0]
        return group

    def _determine_labels_and_activity(self, merged_df):
        """
        Determines the activity labels for the data based on predefined keywords.

        Args:
            merged_df (pd.DataFrame): The DataFrame containing merged data.

        Returns:
            pd.DataFrame: The DataFrame with determined activity labels.
        """
        inhibitor_keywords = [
            'inhibition', 'reversible inhibition', 'time dependent inhibition',
            'inhibitory activity', 'time-dependent inhibition', 'time dependent irreversible inhibition',
            'inhibitory concentration', 'inhibitory effect', 'inhibitory potency',
            'concentration required to inhibit', 'competitive inhibition', 'cyp inhibition',
            'irreversible inhibition', 'mechanism based inhibition', 'mixed inhibition',
            'mixed type inhibition', 'inhibitory constant', 'antagonistic activity', 'selectivity',
            's1p4 agonists', 'small molecule antagonists', 'displacement', 'mediated midazolam 1-hydroxylation',
            'time/nadph-dependent inhibition', 'reversal inhibition', 'mechanism-based inhibition',
            'mechanism based time dependent inhibition', 'reversible competitive inhibition',
            'predictive competitive inhibition','noncompetitive inhibition', 'in vitro inhibitory',
            'in vitro inhibition', 'inhibition of', 'direct inhibition','enzyme inhibition', 'dndi',
            'inhibition assay'
        ]

        ligand_keywords = [
            'binding affinity', 'spectral binding', 'interaction with', 'bind',
            'covalent binding affinity', 'apparent binding affinity'
        ]

        inhibitor_substrate_keywords = [
            'inhibitors and substrates'
        ]

        inhibitor_activator_modulator_keywords = [
            'apoprotein formation', 'panel assay', 'eurofins-panlabs enzyme assay'
        ]

        substrate_keywords = [
            'drug metabolism', 'prodrug', 'metabolic', 'oxidation', 'substrate activity',
            'michaelis-menten', 'metabolic stability', 'bioactivation', 'drug level',
            'enzyme-mediated drug depletion', 'enzyme-mediated compound formation',
            'phenotyping', 'activity of human recombinant cyp', 'activity of recombinant cyp',
            'activity at cyp', 'enzyme-mediated drug metabolism'
        ]

        inactivator_keywords = [
            'inactivator', 'inactivation of', 'mechanism based inactivation of', 'inactivators',
            'metabolism dependent inactivation'
        ]

        activator_keywords = [
            'assay for activators', 'activation of', 'activators of'
        ]

        inducer_keywords = [
            'induction of', 'inducer', 'inducers', 'time-dependant induction'
        ]

        all_keywords = (inhibitor_keywords + ligand_keywords + inhibitor_substrate_keywords +
                        inhibitor_activator_modulator_keywords + substrate_keywords +
                        inactivator_keywords + activator_keywords + inducer_keywords)

        keyword_to_label = {
            **{keyword: 'Inhibitor' for keyword in inhibitor_keywords},
            **{keyword: 'Inhibitor/Substrate' for keyword in inhibitor_substrate_keywords},
            **{keyword: 'Inhibitor/Inducer/Modulator' for keyword in inhibitor_activator_modulator_keywords},
            **{keyword: 'Substrate' for keyword in substrate_keywords},
            **{keyword: 'Inactivator' for keyword in inactivator_keywords},
            **{keyword: 'Activator' for keyword in activator_keywords},
            **{keyword: 'Inducer' for keyword in inducer_keywords},
            **{keyword: 'Ligand' for keyword in ligand_keywords},
        }

        def determine_active_label(assay_name):
            # Determine the appropriate label based on the first keyword found in the assay name
            assay_name_lower = assay_name.lower()
            first_keyword = None
            first_position = len(assay_name_lower)

            for keyword in all_keywords:
                position = assay_name_lower.find(keyword)
                if 0 <= position < first_position:
                    first_keyword = keyword
                    first_position = position

            if first_keyword:
                return keyword_to_label[first_keyword]
            return 'Inhibitor/Inducer/Modulator'

        merged_df['activity'] = None

        # Assign the 'Inactive' label where the activity outcome is inactive
        inactive_mask = merged_df['activity_outcome'] == 'Inactive'
        merged_df.loc[inactive_mask, 'activity'] = 'Inactive'

        # Assign labels based on assay name keywords for active outcomes
        active_mask = merged_df['activity_outcome'] == 'Active'
        if active_mask.any():
            merged_df.loc[active_mask, 'activity'] = merged_df.loc[active_mask, 'assay_name'].apply(determine_active_label)
            merged_df.loc[active_mask & merged_df['activity_name'].isin(['Km', 'Drug metabolism']), 'activity'] = 'Substrate'

            # Assay-name patterns with non-capturing groups
            substrate_pattern = r'(?:activity of.*oxidation)|(?:activity at cyp.*phenotyping)|(?:activity at human recombinant cyp.*formation)|(?:activity at recombinant cyp.*formation)'
            act_ind_mod_pattern = r'(?:effect on cyp)|(?:effect on human recombinant cyp)|(?:effect on recombinant cyp)|(?:effect on human cyp)'
            inducer_pattern = r'(?:effect on cyp.*induction)|(?:induction of.*)'

            # na=False keeps rows with missing assay names or directions out of the boolean masks
            merged_df.loc[active_mask & merged_df['assay_name'].str.contains(substrate_pattern, case=False, regex=True, na=False), 'activity'] = 'Substrate'
            merged_df.loc[active_mask & merged_df['assay_name'].str.contains(act_ind_mod_pattern, case=False, regex=True, na=False), 'activity'] = 'Inhibitor/Inducer/Modulator'
            merged_df.loc[active_mask & merged_df['assay_name'].str.contains(inducer_pattern, case=False, regex=True, na=False), 'activity'] = 'Inducer'

            merged_df.loc[active_mask & merged_df['activity_direction'].str.contains('decreasing', case=False, na=False), 'activity'] = 'Inhibitor'
            merged_df.loc[active_mask & merged_df['activity_direction'].str.contains('increasing', case=False, na=False), 'activity'] = 'Activator'
            merged_df.loc[active_mask & (merged_df['aid'] == 1215398), 'activity'] = 'Inactivator'

        return merged_df
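
To make the keyword-based labelling above concrete, here is a small illustrative sketch. The DataFrame values are invented, and object.__new__ is used only because __init__ scans the Data/ folder while _determine_labels_and_activity reads nothing from the instance:

import pandas as pd

from chemgraphbuilder.relationship_data_processor import RelationshipDataProcessor

# Bypass __init__ for this illustration; the method only uses its argument.
proc = object.__new__(RelationshipDataProcessor)

df = pd.DataFrame({
    'aid': [10, 11],
    'cid': [100, 101],
    'assay_name': ['Inhibition of CYP3A4', 'Drug metabolism assay for CYP2D6'],
    'activity_outcome': ['Active', 'Active'],
    'activity_name': [None, None],
    'activity_direction': [None, None],
})

labelled = proc._determine_labels_and_activity(df)
print(labelled[['assay_name', 'activity']])
# 'inhibition' is the earliest keyword hit in row 1 -> 'Inhibitor';
# 'drug metabolism' is the earliest hit in row 2 -> 'Substrate'.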

__init__(path, start_chunk=0)

Initializes the RelationshipDataProcessor with the specified path and start chunk index.

Parameters:

    path (str): The directory path containing the CSV files. Required.
    start_chunk (int): The starting index for processing chunks. Default: 0.
Source code in chemgraphbuilder/relationship_data_processor.py
def __init__(self, path, start_chunk=0):
    """
    Initializes the RelationshipDataProcessor with the specified path and start chunk index.

    Args:
        path (str): The directory path containing the CSV files.
        start_chunk (int): The starting index for processing chunks.
    """
    self.path = path
    self.csv_files = glob.glob(os.path.join(path, "AID_*.csv"))
    self.start_chunk = start_chunk
    self.all_data_connected = {}
    self.unique_column_names = []

    # Check if the all_data_connected_dict and all_columns files exist
    all_data_connected_file = 'Data/Relationships/all_data_connected_dict.txt'
    all_columns_file = 'Data/Relationships/all_columns.txt'

    if os.path.exists(all_data_connected_file):
        self.all_data_connected = self._load_all_data_connected_from_file(all_data_connected_file)
    else:
        self.all_data_connected = self._load_all_data_connected('Data/AllDataConnected.csv')

    if os.path.exists(all_columns_file):
        self.unique_column_names = self._load_columns_from_file(all_columns_file)
    else:
        self.unique_column_names = self._get_filtered_columns()
        self._save_columns_to_file(all_columns_file, self.unique_column_names)

    # Ensure the 'activity' column is included
    if 'activity' not in self.unique_column_names:
        self.unique_column_names.append('activity')

most_frequent(row) staticmethod

Finds the most frequent value in a row, excluding NaN values.

Parameters:

    row (pd.Series): A row from a DataFrame. Required.

Returns:

    str: The most frequent value in the row.

Source code in chemgraphbuilder/relationship_data_processor.py
@staticmethod
def most_frequent(row):
    """
    Finds the most frequent value in a row, excluding NaN values.

    Args:
        row (pd.Series): A row from a DataFrame.

    Returns:
        str: The most frequent value in the row.
    """
    values = row.dropna()
    string_values = values[values.apply(lambda x: isinstance(x, str))]
    return string_values.mode()[0] if not string_values.empty else None
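
Since most_frequent is a staticmethod, it can be called directly; the row values here are made up for illustration:

import pandas as pd

from chemgraphbuilder.relationship_data_processor import RelationshipDataProcessor

row = pd.Series(['active', 'active', 'inactive', 3.14, None])
# Non-string values and NaNs are ignored; the modal string wins.
print(RelationshipDataProcessor.most_frequent(row))  # -> 'active'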

process_files()

Processes the CSV files by filtering, cleaning, and augmenting data.

The processed data is saved to output files.

Source code in chemgraphbuilder/relationship_data_processor.py
def process_files(self):
    """
    Processes the CSV files by filtering, cleaning, and augmenting data.

    The processed data is saved to output files.
    """
    self._filter_and_clean_data()
    logging.info("Data filtered, cleaned, and combined successfully.")

propagate_phenotype(group) staticmethod

Propagates the phenotype information within a group.

Parameters:

    group (pd.DataFrame): A DataFrame group. Required.

Returns:

    pd.DataFrame: The updated group with propagated phenotype information.

Source code in chemgraphbuilder/relationship_data_processor.py
@staticmethod
def propagate_phenotype(group):
    """
    Propagates the phenotype information within a group.

    Args:
        group (pd.DataFrame): A DataFrame group.

    Returns:
        pd.DataFrame: The updated group with propagated phenotype information.
    """
    phenotype_value = group['phenotype'].dropna().unique()
    if len(phenotype_value) > 0:
        group['phenotype'] = phenotype_value[0]
    return group
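
A small sketch of the propagation inside a groupby, with invented data:

import pandas as pd

from chemgraphbuilder.relationship_data_processor import RelationshipDataProcessor

df = pd.DataFrame({
    'assay_name': ['A', 'A', 'B'],
    'phenotype': ['inhibitor', None, None],
})

# Within group 'A' the single non-null phenotype is copied to every row;
# group 'B' has no phenotype, so it is left untouched.
out = df.groupby('assay_name', group_keys=False).apply(
    RelationshipDataProcessor.propagate_phenotype
)
print(out)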

8. Add Graph Relationships
