Read and Examine Data

System Details

# Imports
import pandas as pd
import watermark
import hvplot
import hvplot.pandas  # noqa
import pathlib

# System version
print(watermark.watermark())

Last updated: 2025-07-07T16:43:52.152160+01:00

Python implementation: CPython
Python version       : 3.12.11
IPython version      : 9.4.0

Compiler    : GCC 13.3.0
OS          : Linux
Release     : 6.6.87.2-microsoft-standard-WSL2
Machine     : x86_64
Processor   : x86_64
CPU cores   : 20
Architecture: 64bit

# Package versions
print(watermark.watermark(iversions=True, globals_=globals()))

watermark: 2.5.0
hvplot   : 0.11.3
pandas   : 2.3.0

Paths

# Set base path
BASE = pathlib.Path(r"/home/miah0x41/repos/basra-well-monitoring")

# DATA
DATA = BASE / 'data'

# RAW
RAW = DATA / 'raw'

# PROCESSED
PROCESSED = DATA / 'processed'

# Create directory if it doesn't exist
PROCESSED.mkdir(parents=True, exist_ok=True)

Read Data

# List files
files = list(RAW.iterdir())

# Preview
[f.name for f in files]

['WQ1_0457.csv - Sheet1.csv',
 'well Data 2.csv',
 'WQ1_0024.csv - Sheet1.csv',
 'well Data 1b.csv',
 'well Data 2b.csv',
 'well Data 1.csv']

The first three rows contains the column names in a nested fashion. Read the data with line 2 as the column names.

!head -n 20 "{RAW / files[-1]}"

Data Recovery Area,Closing SDV,,,,,,Data Recovery Area,,Wellhead pressure increase,,,
Time,Flowline Pressure,Time,Wellhead Pressure,Time,Well Temperature,,Time,Flowline Pressure,Time,Wellhead Pressure,Time,Well Temperature
,WQ1_0457_hz-FLP,,WQ1_0457_hz-FTP,,WQ1_0457_hz-FLT,,,WQ1_0024-FLP,,WQ1_0024-FTP,,WQ1_0024-FLT
,,,,,,,,,,,,
,,,,,,,,,,,,
,,,,,,,,,,,,
,,,,,,,,,,,,
,,,,,,,,,,,,
6/1/2025 21:34,223.5195007,6/1/2025 21:34,270.1679382,6/1/2025 21:34,53.50720215,,45658.8992,213.9455414,45658.8992,228.8192291,45658.8992,51.58621216
6/1/2025 21:39,223.5195007,6/1/2025 21:39,270.3991699,6/1/2025 21:39,53.40142822,,45658.90267,214.5240784,45658.90267,228.7321472,45658.90267,51.58621216
6/1/2025 21:44,223.1580505,6/1/2025 21:44,270.3991699,6/1/2025 21:44,53.40142822,,45658.90615,214.5240784,45658.90615,228.7321472,45658.90615,51.87691879
6/1/2025 21:49,223.1580505,6/1/2025 21:49,269.7843933,6/1/2025 21:49,53.34805298,,45658.90962,215.0088196,45658.90962,228.674118,45658.90962,51.87691879
6/1/2025 21:54,223.9614716,6/1/2025 21:54,269.7843933,6/1/2025 21:54,53.34805298,,45658.91309,215.0088196,45658.91309,228.674118,45658.91309,52.09735107
6/1/2025 21:59,223.9614716,6/1/2025 21:59,269.1842651,6/1/2025 21:59,53.31414795,,45658.91656,214.7212524,45658.91656,228.3285675,45658.91656,52.09735107
6/1/2025 22:04,223.332428,6/1/2025 22:04,269.1842651,6/1/2025 22:04,53.31414795,,45658.92003,214.7212524,45658.92003,228.3285675,45658.92003,52.11193466
6/1/2025 22:09,223.332428,6/1/2025 22:09,268.6045837,6/1/2025 22:09,53.34472275,,45658.92351,215.3478546,45658.92351,227.9902039,45658.92351,52.11193466
6/1/2025 22:14,223.5942993,6/1/2025 22:14,268.6045837,6/1/2025 22:14,53.34472275,,45658.92698,215.3478546,45658.92698,227.9902039,45658.92698,52.09765625
6/1/2025 22:19,223.5942993,6/1/2025 22:19,268.6217041,6/1/2025 22:19,53.49682617,,45658.93045,213.5865479,45658.93045,227.9753265,45658.93045,52.09765625
6/1/2025 22:24,223.9290161,6/1/2025 22:24,268.6217041,6/1/2025 22:24,53.49682617,,45658.93392,213.5865479,45658.93392,227.9753265,45658.93392,52.00146484
6/1/2025 22:29,223.9290161,6/1/2025 22:29,271.5575867,6/1/2025 22:29,53.56515503,,45658.9374,214.236084,45658.9374,227.6369019,45658.9374,52.00146484

# Read Well Data
df = pd.read_csv(RAW / 'well Data 1.csv', header=1)

# Preview
df.head(10)

	Time	Flowline Pressure	Time.1	Wellhead Pressure	Time.2	Well Temperature	Unnamed: 6	Time.3	Flowline Pressure.1	Time.4	Wellhead Pressure.1	Time.5	Well Temperature.1
0	NaN	WQ1_0457_hz-FLP	NaN	WQ1_0457_hz-FTP	NaN	WQ1_0457_hz-FLT	NaN	NaN	WQ1_0024-FLP	NaN	WQ1_0024-FTP	NaN	WQ1_0024-FLT
1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
6	6/1/2025 21:34	223.5195007	6/1/2025 21:34	270.1679382	6/1/2025 21:34	53.50720215	NaN	45658.8992	213.9455414	45658.8992	228.8192291	45658.8992	51.58621216
7	6/1/2025 21:39	223.5195007	6/1/2025 21:39	270.3991699	6/1/2025 21:39	53.40142822	NaN	45658.90267	214.5240784	45658.90267	228.7321472	45658.90267	51.58621216
8	6/1/2025 21:44	223.1580505	6/1/2025 21:44	270.3991699	6/1/2025 21:44	53.40142822	NaN	45658.90615	214.5240784	45658.90615	228.7321472	45658.90615	51.87691879
9	6/1/2025 21:49	223.1580505	6/1/2025 21:49	269.7843933	6/1/2025 21:49	53.34805298	NaN	45658.90962	215.0088196	45658.90962	228.674118	45658.90962	51.87691879

# Remove columns with nulls
df = df.dropna(axis=1, how='all')

# Preview
df.head(10)

	Time	Flowline Pressure	Time.1	Wellhead Pressure	Time.2	Well Temperature	Time.3	Flowline Pressure.1	Time.4	Wellhead Pressure.1	Time.5	Well Temperature.1
0	NaN	WQ1_0457_hz-FLP	NaN	WQ1_0457_hz-FTP	NaN	WQ1_0457_hz-FLT	NaN	WQ1_0024-FLP	NaN	WQ1_0024-FTP	NaN	WQ1_0024-FLT
1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
6	6/1/2025 21:34	223.5195007	6/1/2025 21:34	270.1679382	6/1/2025 21:34	53.50720215	45658.8992	213.9455414	45658.8992	228.8192291	45658.8992	51.58621216
7	6/1/2025 21:39	223.5195007	6/1/2025 21:39	270.3991699	6/1/2025 21:39	53.40142822	45658.90267	214.5240784	45658.90267	228.7321472	45658.90267	51.58621216
8	6/1/2025 21:44	223.1580505	6/1/2025 21:44	270.3991699	6/1/2025 21:44	53.40142822	45658.90615	214.5240784	45658.90615	228.7321472	45658.90615	51.87691879
9	6/1/2025 21:49	223.1580505	6/1/2025 21:49	269.7843933	6/1/2025 21:49	53.34805298	45658.90962	215.0088196	45658.90962	228.674118	45658.90962	51.87691879

# Remove rows with nulls
df = df.dropna(axis=0, how='all')

# Preview
df.head(10)

	Time	Flowline Pressure	Time.1	Wellhead Pressure	Time.2	Well Temperature	Time.3	Flowline Pressure.1	Time.4	Wellhead Pressure.1	Time.5	Well Temperature.1
0	NaN	WQ1_0457_hz-FLP	NaN	WQ1_0457_hz-FTP	NaN	WQ1_0457_hz-FLT	NaN	WQ1_0024-FLP	NaN	WQ1_0024-FTP	NaN	WQ1_0024-FLT
6	6/1/2025 21:34	223.5195007	6/1/2025 21:34	270.1679382	6/1/2025 21:34	53.50720215	45658.8992	213.9455414	45658.8992	228.8192291	45658.8992	51.58621216
7	6/1/2025 21:39	223.5195007	6/1/2025 21:39	270.3991699	6/1/2025 21:39	53.40142822	45658.90267	214.5240784	45658.90267	228.7321472	45658.90267	51.58621216
8	6/1/2025 21:44	223.1580505	6/1/2025 21:44	270.3991699	6/1/2025 21:44	53.40142822	45658.90615	214.5240784	45658.90615	228.7321472	45658.90615	51.87691879
9	6/1/2025 21:49	223.1580505	6/1/2025 21:49	269.7843933	6/1/2025 21:49	53.34805298	45658.90962	215.0088196	45658.90962	228.674118	45658.90962	51.87691879
10	6/1/2025 21:54	223.9614716	6/1/2025 21:54	269.7843933	6/1/2025 21:54	53.34805298	45658.91309	215.0088196	45658.91309	228.674118	45658.91309	52.09735107
11	6/1/2025 21:59	223.9614716	6/1/2025 21:59	269.1842651	6/1/2025 21:59	53.31414795	45658.91656	214.7212524	45658.91656	228.3285675	45658.91656	52.09735107
12	6/1/2025 22:04	223.332428	6/1/2025 22:04	269.1842651	6/1/2025 22:04	53.31414795	45658.92003	214.7212524	45658.92003	228.3285675	45658.92003	52.11193466
13	6/1/2025 22:09	223.332428	6/1/2025 22:09	268.6045837	6/1/2025 22:09	53.34472275	45658.92351	215.3478546	45658.92351	227.9902039	45658.92351	52.11193466
14	6/1/2025 22:14	223.5942993	6/1/2025 22:14	268.6045837	6/1/2025 22:14	53.34472275	45658.92698	215.3478546	45658.92698	227.9902039	45658.92698	52.09765625

# Remove first row
df = df.iloc[1:]

# Preview
df.head(10)

	Time	Flowline Pressure	Time.1	Wellhead Pressure	Time.2	Well Temperature	Time.3	Flowline Pressure.1	Time.4	Wellhead Pressure.1	Time.5	Well Temperature.1
6	6/1/2025 21:34	223.5195007	6/1/2025 21:34	270.1679382	6/1/2025 21:34	53.50720215	45658.8992	213.9455414	45658.8992	228.8192291	45658.8992	51.58621216
7	6/1/2025 21:39	223.5195007	6/1/2025 21:39	270.3991699	6/1/2025 21:39	53.40142822	45658.90267	214.5240784	45658.90267	228.7321472	45658.90267	51.58621216
8	6/1/2025 21:44	223.1580505	6/1/2025 21:44	270.3991699	6/1/2025 21:44	53.40142822	45658.90615	214.5240784	45658.90615	228.7321472	45658.90615	51.87691879
9	6/1/2025 21:49	223.1580505	6/1/2025 21:49	269.7843933	6/1/2025 21:49	53.34805298	45658.90962	215.0088196	45658.90962	228.674118	45658.90962	51.87691879
10	6/1/2025 21:54	223.9614716	6/1/2025 21:54	269.7843933	6/1/2025 21:54	53.34805298	45658.91309	215.0088196	45658.91309	228.674118	45658.91309	52.09735107
11	6/1/2025 21:59	223.9614716	6/1/2025 21:59	269.1842651	6/1/2025 21:59	53.31414795	45658.91656	214.7212524	45658.91656	228.3285675	45658.91656	52.09735107
12	6/1/2025 22:04	223.332428	6/1/2025 22:04	269.1842651	6/1/2025 22:04	53.31414795	45658.92003	214.7212524	45658.92003	228.3285675	45658.92003	52.11193466
13	6/1/2025 22:09	223.332428	6/1/2025 22:09	268.6045837	6/1/2025 22:09	53.34472275	45658.92351	215.3478546	45658.92351	227.9902039	45658.92351	52.11193466
14	6/1/2025 22:14	223.5942993	6/1/2025 22:14	268.6045837	6/1/2025 22:14	53.34472275	45658.92698	215.3478546	45658.92698	227.9902039	45658.92698	52.09765625
15	6/1/2025 22:19	223.5942993	6/1/2025 22:19	268.6217041	6/1/2025 22:19	53.49682617	45658.93045	213.5865479	45658.93045	227.9753265	45658.93045	52.09765625

Data Types

Convert the multiple Time columns into datetime format.
Convert the remaining data into appropriate formats
Rename the columns

There are two time formats:

6/1/2025 21:39 and 45658.90267
The latter is a serialised Excel format

# Examine data types
df.dtypes

Time                   object
Flowline Pressure      object
Time.1                 object
Wellhead Pressure      object
Time.2                 object
Well Temperature       object
Time.3                 object
Flowline Pressure.1    object
Time.4                 object
Wellhead Pressure.1    object
Time.5                 object
Well Temperature.1     object
dtype: object

# Select time columns for conversion:
cols = [
    'Time',
    'Time.1',
    'Time.2'
]

# Preview
df.loc[:, cols].head()

	Time	Time.1	Time.2
6	6/1/2025 21:34	6/1/2025 21:34	6/1/2025 21:34
7	6/1/2025 21:39	6/1/2025 21:39	6/1/2025 21:39
8	6/1/2025 21:44	6/1/2025 21:44	6/1/2025 21:44
9	6/1/2025 21:49	6/1/2025 21:49	6/1/2025 21:49
10	6/1/2025 21:54	6/1/2025 21:54	6/1/2025 21:54

The most popular value is an empty space; this needs replacing prior to datetime conversion.

# Check for most popular values
df[cols].value_counts().head().index

MultiIndex([(             ' ',              ' ',              ' '),
            ('6/1/2025 21:34', '6/1/2025 21:34', '6/1/2025 21:34'),
            ('6/1/2025 21:39', '6/1/2025 21:39', '6/1/2025 21:39'),
            ('6/1/2025 21:44', '6/1/2025 21:44', '6/1/2025 21:44'),
            ('6/1/2025 21:49', '6/1/2025 21:49', '6/1/2025 21:49')],
           names=['Time', 'Time.1', 'Time.2'])

# Replace blank values with NaT
df[cols] = df[cols].replace(' ', pd.NaT)

# Preview
df[cols].tail()

	Time	Time.1	Time.2
667593	NaT	NaT	NaT
667594	NaT	NaT	NaT
667595	NaT	NaT	NaT
667596	NaT	NaN	NaT
667597	NaT	NaN	NaN

# Convert to datetime objects
for c in cols:
    df[c] = pd.to_datetime(df[c], errors='coerce', format='%m/%d/%Y %H:%M')

# Select the remaining Time columns
cols = [c for c in df.select_dtypes('O').columns if 'Time' in c]

# Preview
cols

['Time.3', 'Time.4', 'Time.5']

# Preview data
df.loc[:, cols].head()

	Time.3	Time.4	Time.5
6	45658.8992	45658.8992	45658.8992
7	45658.90267	45658.90267	45658.90267
8	45658.90615	45658.90615	45658.90615
9	45658.90962	45658.90962	45658.90962
10	45658.91309	45658.91309	45658.91309

# Convert numeric time columns
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

# Check for most popular values
df[cols].value_counts().head().index

MultiIndex([( 45658.8992,  45658.8992,  45658.8992),
            (45658.90267, 45658.90267, 45658.90267),
            (45658.90615, 45658.90615, 45658.90615),
            (45658.90962, 45658.90962, 45658.90962),
            (45658.91309, 45658.91309, 45658.91309)],
           names=['Time.3', 'Time.4', 'Time.5'])

# Convert time from Excel format
for c in cols:
    df[c] = pd.to_datetime(df[c], origin='1899-12-30', unit='D', errors='coerce')

# Preview data
df.loc[:, cols].head()

	Time.3	Time.4	Time.5
6	2025-01-01 21:34:50.879999982	2025-01-01 21:34:50.879999982	2025-01-01 21:34:50.879999982
7	2025-01-01 21:39:50.688000267	2025-01-01 21:39:50.688000267	2025-01-01 21:39:50.688000267
8	2025-01-01 21:44:51.360000216	2025-01-01 21:44:51.360000216	2025-01-01 21:44:51.360000216
9	2025-01-01 21:49:51.167999870	2025-01-01 21:49:51.167999870	2025-01-01 21:49:51.167999870
10	2025-01-01 21:54:50.976000155	2025-01-01 21:54:50.976000155	2025-01-01 21:54:50.976000155

# Check data types
df.dtypes

Time                   datetime64[ns]
Flowline Pressure              object
Time.1                 datetime64[ns]
Wellhead Pressure              object
Time.2                 datetime64[ns]
Well Temperature               object
Time.3                 datetime64[ns]
Flowline Pressure.1            object
Time.4                 datetime64[ns]
Wellhead Pressure.1            object
Time.5                 datetime64[ns]
Well Temperature.1             object
dtype: object

# Identify remaining columns for conversion
cols = df.select_dtypes('O').columns

# Convert remaining object types into numeric
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

# Check dtypes
df.dtypes

Time                   datetime64[ns]
Flowline Pressure             float64
Time.1                 datetime64[ns]
Wellhead Pressure             float64
Time.2                 datetime64[ns]
Well Temperature              float64
Time.3                 datetime64[ns]
Flowline Pressure.1           float64
Time.4                 datetime64[ns]
Wellhead Pressure.1           float64
Time.5                 datetime64[ns]
Well Temperature.1            float64
dtype: object

# Map new columns names
cols = [
    'time0',
    'WQ1_0457_hz-FLP',
    'time1',
    'WQ1_0457_hz-FTP',
    'time2',
    'WQ1_0457_hz-FLT',
    'time3',
    'WQ1_0024-FLP',
    'time4',
    'WQ1_0024-FTP',
    'time5',
    'WQ1_0024-FLT',
]

# Rename columns
df.columns = cols

# Preview
df.head()

	time0	WQ1_0457_hz-FLP	time1	WQ1_0457_hz-FTP	time2	WQ1_0457_hz-FLT	time3	WQ1_0024-FLP	time4	WQ1_0024-FTP	time5	WQ1_0024-FLT
6	2025-06-01 21:34:00	223.519501	2025-06-01 21:34:00	270.167938	2025-06-01 21:34:00	53.507202	2025-01-01 21:34:50.879999982	213.945541	2025-01-01 21:34:50.879999982	228.819229	2025-01-01 21:34:50.879999982	51.586212
7	2025-06-01 21:39:00	223.519501	2025-06-01 21:39:00	270.399170	2025-06-01 21:39:00	53.401428	2025-01-01 21:39:50.688000267	214.524078	2025-01-01 21:39:50.688000267	228.732147	2025-01-01 21:39:50.688000267	51.586212
8	2025-06-01 21:44:00	223.158051	2025-06-01 21:44:00	270.399170	2025-06-01 21:44:00	53.401428	2025-01-01 21:44:51.360000216	214.524078	2025-01-01 21:44:51.360000216	228.732147	2025-01-01 21:44:51.360000216	51.876919
9	2025-06-01 21:49:00	223.158051	2025-06-01 21:49:00	269.784393	2025-06-01 21:49:00	53.348053	2025-01-01 21:49:51.167999870	215.008820	2025-01-01 21:49:51.167999870	228.674118	2025-01-01 21:49:51.167999870	51.876919
10	2025-06-01 21:54:00	223.961472	2025-06-01 21:54:00	269.784393	2025-06-01 21:54:00	53.348053	2025-01-01 21:54:50.976000155	215.008820	2025-01-01 21:54:50.976000155	228.674118	2025-01-01 21:54:50.976000155	52.097351

Export Data

# Export data
df.to_parquet(PROCESSED / 'well1.pqt')