Skip to content

Excel To SQL Module

Excel Extraction

Application to open and extract data from version of amphibian dataset in excel.

CLI interface takes file path argument and extracts data. Then cleans dataset then passes the data to api.

This tool accepts comma separated value files (.csv) as well as Excel files (.xlsx, xls).

This script requires that 'pandas' be installed within the Python environment that this is running on.

create_connection(db_output_name)

Create database connection object.

If database does not exist it will create the database.

Parameters:

Name Type Description Default
db_output_name(str)

Name of db to connect to or create

required

Returns:

Name Type Description
conn object

SQLite3 connection object

Source code in report_generator/excel_extraction/excel_to_sql.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def create_connection(db_output_name: str) -> object:
    """Create database connection object.

    If database does not exist it will create the database.

    Args:
        db_output_name(str): Name of db to connect to or create

    Returns:
        conn(object): SQLite3 connection object
    """
    conn = None

    try:
        conn = sqlite3.connect(db_output_name)
    except Error as e:
        logger.error(e)

    return conn

create_data_frame(path_to_dataset)

Create datafrane from dataset.

Loads excel file and returns Pandas DataFrame obj

Parameters:

Name Type Description Default
path_to_data_set(str)

file path string

required

Returns:

Name Type Description
data_frame object

Pandas DataFrame object

Source code in report_generator/excel_extraction/excel_to_sql.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
def create_data_frame(path_to_dataset: str):
    """Create datafrane from dataset.

    Loads excel file and returns Pandas DataFrame obj

    Args:
        path_to_data_set(str): file path string

    Returns:
        data_frame(object): Pandas DataFrame object
    """
    logger.info("Created DataFrame")
    logger.info(path_to_dataset)
    data_frame = None
    try:
        data_frame = pandas.read_excel(path_to_dataset)

    except FileNotFoundError as e:
        logger.error("Failed to open excel file")
        logger.error(e)

    return data_frame

create_tables(conn)

Create tables in database.

Gets sql string to pass to cursor to create tables for database

Parameters:

Name Type Description Default
conn(object)

sqlite3 connection object

required
Source code in report_generator/excel_extraction/excel_to_sql.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def create_tables(conn: object) -> None:
    """Create tables in database.

    Gets sql string to pass to cursor to create tables for database

    Args:
        conn(object): sqlite3 connection object
    """
    logger.info("Creating tables")
    tables_list = tables.get_tables_sql()

    try:
        cursor = conn.cursor()
        for table in tables_list:
            cursor.execute(table)
        cursor.close()
    except Error as e:
        logger.error(e)

export_to_database(path_to_excel, db_output_name=None)

Export data from dataset to database.

Takes a path to excel file, opens it, processes it, creates a sqlite db, creates tables, populates database

Parameters:

Name Type Description Default
path_to_excel(str)

file path string

required
db_output_name(str)

db file name

required
Source code in report_generator/excel_extraction/excel_to_sql.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def export_to_database(
    path_to_excel: str,
    db_output_name: str = None,
) -> None:
    """Export data from dataset to database.

    Takes a path to excel file, opens it, processes it, creates a sqlite db,
    creates tables, populates database

    Args:
        path_to_excel(str): file path string
        db_output_name(str): db file name
    """
    logger.info("Export to Database Start")
    pandas.options.mode.chained_assignment = None
    if db_output_name is None:
        dtstr = datetime.datetime.now().strftime("%m-%d-%Y_%H:%M:%S")
        db_output_name = f"databases/dataset_{dtstr}.db"
    # Open excel
    data_frame = None
    try:
        # Create and clean data
        data_frame = create_data_frame(path_to_excel)
        clean_data_frame = clean_data(data_frame)

        # Update the locations
        updated_data_frame = update_location(clean_data_frame)

        # Create the database/database connection
        conn = create_connection(db_output_name)

        # Creates tables/Makes sure tables are created
        create_tables(conn)

        # Structures data from dataframe to match tables layout
        structured_data = structure_data(updated_data_frame)

        # Populate tables
        populate_tables(structured_data, conn)

    except FileNotFoundError as e:
        logger.error(e)
    logger.info("Export to database end.")

main(path_to_file, output)

Run main method.

Main function for excel to sql

Parameters:

Name Type Description Default
path_to_file(str)

File path string.

required
Source code in report_generator/excel_extraction/excel_to_sql.py
173
174
175
176
177
178
179
180
181
def main(path_to_file: str, output: str) -> None:
    """Run main method.

    Main function for excel to sql

    Args:
        path_to_file(str): File path string.
    """
    export_to_database(path_to_file, output)

populate_table(table_name, table_data, conn)

Populate database tables with data.

Takes args and calls Pandas.DataFrame.to_sql Method

Parameters:

Name Type Description Default
table_name(str)

name of table for data to be passed to

required
table_data pandas.DataFrame

Panda's DataFrame object containing data to go in table

required
conn sqlite3.Connection

sqlite3 connection object

required
Source code in report_generator/excel_extraction/excel_to_sql.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def populate_table(
    table_name: str, table_data: pandas.DataFrame, conn: sqlite3.Connection
) -> None:
    """Populate database tables with data.

    Takes args and calls Pandas.DataFrame.to_sql Method

    Args:
        table_name(str):                 name of table for data to be passed to
        table_data (pandas.DataFrame):   Panda's DataFrame object containing data to go in table
        conn (sqlite3.Connection):       sqlite3 connection object
    """
    if table_name == "species":
        table_data["elevation_min"] = pandas.to_numeric(table_data["elevation_min"])

    table_data.to_sql(table_name, conn, index=False, if_exists="append")

populate_tables(structured_data, conn)

Populate database with data from structured_data.

Takes structured data and sqlite3 connector object loops through structured_data obj and passes values to the populate_table method

Parameters:

Name Type Description Default
structured_data dict

dict made up of Panda's DataFrame objects

required
conn sqlite3.Connection

sqlite3 connector object

required
Source code in report_generator/excel_extraction/excel_to_sql.py
116
117
118
119
120
121
122
123
124
125
126
127
128
def populate_tables(structured_data: dict, conn: sqlite3.Connection) -> None:
    """Populate database with data from structured_data.

    Takes structured data and sqlite3 connector object loops through
    structured_data obj and passes values to the populate_table method

    Args:
        structured_data (dict):     dict made up of Panda's DataFrame objects
        conn (sqlite3.Connection):  sqlite3 connector object
    """
    logger.info("Populating tables")
    for table_name, table_data in structured_data.items():
        populate_table(table_name, table_data, conn)