# Imports
import pandas as pd
import watermark
import hvplot
import hvplot.pandas # noqa
import pathlibRead and Examine Data
System Details
# System version
print(watermark.watermark())Last updated: 2025-07-07T16:43:52.152160+01:00
Python implementation: CPython
Python version : 3.12.11
IPython version : 9.4.0
Compiler : GCC 13.3.0
OS : Linux
Release : 6.6.87.2-microsoft-standard-WSL2
Machine : x86_64
Processor : x86_64
CPU cores : 20
Architecture: 64bit
# Package versions
print(watermark.watermark(iversions=True, globals_=globals()))watermark: 2.5.0
hvplot : 0.11.3
pandas : 2.3.0
Paths
# Set base path
BASE = pathlib.Path(r"/home/miah0x41/repos/basra-well-monitoring")
# DATA
DATA = BASE / 'data'
# RAW
RAW = DATA / 'raw'
# PROCESSED
PROCESSED = DATA / 'processed'
# Create directory if it doesn't exist
PROCESSED.mkdir(parents=True, exist_ok=True)Read Data
# List files
files = list(RAW.iterdir())
# Preview
[f.name for f in files]['WQ1_0457.csv - Sheet1.csv',
'well Data 2.csv',
'WQ1_0024.csv - Sheet1.csv',
'well Data 1b.csv',
'well Data 2b.csv',
'well Data 1.csv']
The first three rows contains the column names in a nested fashion. Read the data with line 2 as the column names.
!head -n 20 "{RAW / files[-1]}"Data Recovery Area,Closing SDV,,,,,,Data Recovery Area,,Wellhead pressure increase,,,
Time,Flowline Pressure,Time,Wellhead Pressure,Time,Well Temperature,,Time,Flowline Pressure,Time,Wellhead Pressure,Time,Well Temperature
,WQ1_0457_hz-FLP,,WQ1_0457_hz-FTP,,WQ1_0457_hz-FLT,,,WQ1_0024-FLP,,WQ1_0024-FTP,,WQ1_0024-FLT
,,,,,,,,,,,,
,,,,,,,,,,,,
,,,,,,,,,,,,
,,,,,,,,,,,,
,,,,,,,,,,,,
6/1/2025 21:34,223.5195007,6/1/2025 21:34,270.1679382,6/1/2025 21:34,53.50720215,,45658.8992,213.9455414,45658.8992,228.8192291,45658.8992,51.58621216
6/1/2025 21:39,223.5195007,6/1/2025 21:39,270.3991699,6/1/2025 21:39,53.40142822,,45658.90267,214.5240784,45658.90267,228.7321472,45658.90267,51.58621216
6/1/2025 21:44,223.1580505,6/1/2025 21:44,270.3991699,6/1/2025 21:44,53.40142822,,45658.90615,214.5240784,45658.90615,228.7321472,45658.90615,51.87691879
6/1/2025 21:49,223.1580505,6/1/2025 21:49,269.7843933,6/1/2025 21:49,53.34805298,,45658.90962,215.0088196,45658.90962,228.674118,45658.90962,51.87691879
6/1/2025 21:54,223.9614716,6/1/2025 21:54,269.7843933,6/1/2025 21:54,53.34805298,,45658.91309,215.0088196,45658.91309,228.674118,45658.91309,52.09735107
6/1/2025 21:59,223.9614716,6/1/2025 21:59,269.1842651,6/1/2025 21:59,53.31414795,,45658.91656,214.7212524,45658.91656,228.3285675,45658.91656,52.09735107
6/1/2025 22:04,223.332428,6/1/2025 22:04,269.1842651,6/1/2025 22:04,53.31414795,,45658.92003,214.7212524,45658.92003,228.3285675,45658.92003,52.11193466
6/1/2025 22:09,223.332428,6/1/2025 22:09,268.6045837,6/1/2025 22:09,53.34472275,,45658.92351,215.3478546,45658.92351,227.9902039,45658.92351,52.11193466
6/1/2025 22:14,223.5942993,6/1/2025 22:14,268.6045837,6/1/2025 22:14,53.34472275,,45658.92698,215.3478546,45658.92698,227.9902039,45658.92698,52.09765625
6/1/2025 22:19,223.5942993,6/1/2025 22:19,268.6217041,6/1/2025 22:19,53.49682617,,45658.93045,213.5865479,45658.93045,227.9753265,45658.93045,52.09765625
6/1/2025 22:24,223.9290161,6/1/2025 22:24,268.6217041,6/1/2025 22:24,53.49682617,,45658.93392,213.5865479,45658.93392,227.9753265,45658.93392,52.00146484
6/1/2025 22:29,223.9290161,6/1/2025 22:29,271.5575867,6/1/2025 22:29,53.56515503,,45658.9374,214.236084,45658.9374,227.6369019,45658.9374,52.00146484
# Read Well Data
df = pd.read_csv(RAW / 'well Data 1.csv', header=1)# Preview
df.head(10)| Time | Flowline Pressure | Time.1 | Wellhead Pressure | Time.2 | Well Temperature | Unnamed: 6 | Time.3 | Flowline Pressure.1 | Time.4 | Wellhead Pressure.1 | Time.5 | Well Temperature.1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | WQ1_0457_hz-FLP | NaN | WQ1_0457_hz-FTP | NaN | WQ1_0457_hz-FLT | NaN | NaN | WQ1_0024-FLP | NaN | WQ1_0024-FTP | NaN | WQ1_0024-FLT |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | 6/1/2025 21:34 | 223.5195007 | 6/1/2025 21:34 | 270.1679382 | 6/1/2025 21:34 | 53.50720215 | NaN | 45658.8992 | 213.9455414 | 45658.8992 | 228.8192291 | 45658.8992 | 51.58621216 |
| 7 | 6/1/2025 21:39 | 223.5195007 | 6/1/2025 21:39 | 270.3991699 | 6/1/2025 21:39 | 53.40142822 | NaN | 45658.90267 | 214.5240784 | 45658.90267 | 228.7321472 | 45658.90267 | 51.58621216 |
| 8 | 6/1/2025 21:44 | 223.1580505 | 6/1/2025 21:44 | 270.3991699 | 6/1/2025 21:44 | 53.40142822 | NaN | 45658.90615 | 214.5240784 | 45658.90615 | 228.7321472 | 45658.90615 | 51.87691879 |
| 9 | 6/1/2025 21:49 | 223.1580505 | 6/1/2025 21:49 | 269.7843933 | 6/1/2025 21:49 | 53.34805298 | NaN | 45658.90962 | 215.0088196 | 45658.90962 | 228.674118 | 45658.90962 | 51.87691879 |
# Remove columns with nulls
df = df.dropna(axis=1, how='all')
# Preview
df.head(10)| Time | Flowline Pressure | Time.1 | Wellhead Pressure | Time.2 | Well Temperature | Time.3 | Flowline Pressure.1 | Time.4 | Wellhead Pressure.1 | Time.5 | Well Temperature.1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | WQ1_0457_hz-FLP | NaN | WQ1_0457_hz-FTP | NaN | WQ1_0457_hz-FLT | NaN | WQ1_0024-FLP | NaN | WQ1_0024-FTP | NaN | WQ1_0024-FLT |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | 6/1/2025 21:34 | 223.5195007 | 6/1/2025 21:34 | 270.1679382 | 6/1/2025 21:34 | 53.50720215 | 45658.8992 | 213.9455414 | 45658.8992 | 228.8192291 | 45658.8992 | 51.58621216 |
| 7 | 6/1/2025 21:39 | 223.5195007 | 6/1/2025 21:39 | 270.3991699 | 6/1/2025 21:39 | 53.40142822 | 45658.90267 | 214.5240784 | 45658.90267 | 228.7321472 | 45658.90267 | 51.58621216 |
| 8 | 6/1/2025 21:44 | 223.1580505 | 6/1/2025 21:44 | 270.3991699 | 6/1/2025 21:44 | 53.40142822 | 45658.90615 | 214.5240784 | 45658.90615 | 228.7321472 | 45658.90615 | 51.87691879 |
| 9 | 6/1/2025 21:49 | 223.1580505 | 6/1/2025 21:49 | 269.7843933 | 6/1/2025 21:49 | 53.34805298 | 45658.90962 | 215.0088196 | 45658.90962 | 228.674118 | 45658.90962 | 51.87691879 |
# Remove rows with nulls
df = df.dropna(axis=0, how='all')
# Preview
df.head(10)| Time | Flowline Pressure | Time.1 | Wellhead Pressure | Time.2 | Well Temperature | Time.3 | Flowline Pressure.1 | Time.4 | Wellhead Pressure.1 | Time.5 | Well Temperature.1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | WQ1_0457_hz-FLP | NaN | WQ1_0457_hz-FTP | NaN | WQ1_0457_hz-FLT | NaN | WQ1_0024-FLP | NaN | WQ1_0024-FTP | NaN | WQ1_0024-FLT |
| 6 | 6/1/2025 21:34 | 223.5195007 | 6/1/2025 21:34 | 270.1679382 | 6/1/2025 21:34 | 53.50720215 | 45658.8992 | 213.9455414 | 45658.8992 | 228.8192291 | 45658.8992 | 51.58621216 |
| 7 | 6/1/2025 21:39 | 223.5195007 | 6/1/2025 21:39 | 270.3991699 | 6/1/2025 21:39 | 53.40142822 | 45658.90267 | 214.5240784 | 45658.90267 | 228.7321472 | 45658.90267 | 51.58621216 |
| 8 | 6/1/2025 21:44 | 223.1580505 | 6/1/2025 21:44 | 270.3991699 | 6/1/2025 21:44 | 53.40142822 | 45658.90615 | 214.5240784 | 45658.90615 | 228.7321472 | 45658.90615 | 51.87691879 |
| 9 | 6/1/2025 21:49 | 223.1580505 | 6/1/2025 21:49 | 269.7843933 | 6/1/2025 21:49 | 53.34805298 | 45658.90962 | 215.0088196 | 45658.90962 | 228.674118 | 45658.90962 | 51.87691879 |
| 10 | 6/1/2025 21:54 | 223.9614716 | 6/1/2025 21:54 | 269.7843933 | 6/1/2025 21:54 | 53.34805298 | 45658.91309 | 215.0088196 | 45658.91309 | 228.674118 | 45658.91309 | 52.09735107 |
| 11 | 6/1/2025 21:59 | 223.9614716 | 6/1/2025 21:59 | 269.1842651 | 6/1/2025 21:59 | 53.31414795 | 45658.91656 | 214.7212524 | 45658.91656 | 228.3285675 | 45658.91656 | 52.09735107 |
| 12 | 6/1/2025 22:04 | 223.332428 | 6/1/2025 22:04 | 269.1842651 | 6/1/2025 22:04 | 53.31414795 | 45658.92003 | 214.7212524 | 45658.92003 | 228.3285675 | 45658.92003 | 52.11193466 |
| 13 | 6/1/2025 22:09 | 223.332428 | 6/1/2025 22:09 | 268.6045837 | 6/1/2025 22:09 | 53.34472275 | 45658.92351 | 215.3478546 | 45658.92351 | 227.9902039 | 45658.92351 | 52.11193466 |
| 14 | 6/1/2025 22:14 | 223.5942993 | 6/1/2025 22:14 | 268.6045837 | 6/1/2025 22:14 | 53.34472275 | 45658.92698 | 215.3478546 | 45658.92698 | 227.9902039 | 45658.92698 | 52.09765625 |
# Remove first row
df = df.iloc[1:]
# Preview
df.head(10)| Time | Flowline Pressure | Time.1 | Wellhead Pressure | Time.2 | Well Temperature | Time.3 | Flowline Pressure.1 | Time.4 | Wellhead Pressure.1 | Time.5 | Well Temperature.1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 6/1/2025 21:34 | 223.5195007 | 6/1/2025 21:34 | 270.1679382 | 6/1/2025 21:34 | 53.50720215 | 45658.8992 | 213.9455414 | 45658.8992 | 228.8192291 | 45658.8992 | 51.58621216 |
| 7 | 6/1/2025 21:39 | 223.5195007 | 6/1/2025 21:39 | 270.3991699 | 6/1/2025 21:39 | 53.40142822 | 45658.90267 | 214.5240784 | 45658.90267 | 228.7321472 | 45658.90267 | 51.58621216 |
| 8 | 6/1/2025 21:44 | 223.1580505 | 6/1/2025 21:44 | 270.3991699 | 6/1/2025 21:44 | 53.40142822 | 45658.90615 | 214.5240784 | 45658.90615 | 228.7321472 | 45658.90615 | 51.87691879 |
| 9 | 6/1/2025 21:49 | 223.1580505 | 6/1/2025 21:49 | 269.7843933 | 6/1/2025 21:49 | 53.34805298 | 45658.90962 | 215.0088196 | 45658.90962 | 228.674118 | 45658.90962 | 51.87691879 |
| 10 | 6/1/2025 21:54 | 223.9614716 | 6/1/2025 21:54 | 269.7843933 | 6/1/2025 21:54 | 53.34805298 | 45658.91309 | 215.0088196 | 45658.91309 | 228.674118 | 45658.91309 | 52.09735107 |
| 11 | 6/1/2025 21:59 | 223.9614716 | 6/1/2025 21:59 | 269.1842651 | 6/1/2025 21:59 | 53.31414795 | 45658.91656 | 214.7212524 | 45658.91656 | 228.3285675 | 45658.91656 | 52.09735107 |
| 12 | 6/1/2025 22:04 | 223.332428 | 6/1/2025 22:04 | 269.1842651 | 6/1/2025 22:04 | 53.31414795 | 45658.92003 | 214.7212524 | 45658.92003 | 228.3285675 | 45658.92003 | 52.11193466 |
| 13 | 6/1/2025 22:09 | 223.332428 | 6/1/2025 22:09 | 268.6045837 | 6/1/2025 22:09 | 53.34472275 | 45658.92351 | 215.3478546 | 45658.92351 | 227.9902039 | 45658.92351 | 52.11193466 |
| 14 | 6/1/2025 22:14 | 223.5942993 | 6/1/2025 22:14 | 268.6045837 | 6/1/2025 22:14 | 53.34472275 | 45658.92698 | 215.3478546 | 45658.92698 | 227.9902039 | 45658.92698 | 52.09765625 |
| 15 | 6/1/2025 22:19 | 223.5942993 | 6/1/2025 22:19 | 268.6217041 | 6/1/2025 22:19 | 53.49682617 | 45658.93045 | 213.5865479 | 45658.93045 | 227.9753265 | 45658.93045 | 52.09765625 |
Data Types
- Convert the multiple
Timecolumns intodatetimeformat. - Convert the remaining data into appropriate formats
- Rename the columns
There are two time formats:
6/1/2025 21:39and45658.90267- The latter is a serialised Excel format
# Examine data types
df.dtypesTime object
Flowline Pressure object
Time.1 object
Wellhead Pressure object
Time.2 object
Well Temperature object
Time.3 object
Flowline Pressure.1 object
Time.4 object
Wellhead Pressure.1 object
Time.5 object
Well Temperature.1 object
dtype: object
# Select time columns for conversion:
cols = [
'Time',
'Time.1',
'Time.2'
]
# Preview
df.loc[:, cols].head()| Time | Time.1 | Time.2 | |
|---|---|---|---|
| 6 | 6/1/2025 21:34 | 6/1/2025 21:34 | 6/1/2025 21:34 |
| 7 | 6/1/2025 21:39 | 6/1/2025 21:39 | 6/1/2025 21:39 |
| 8 | 6/1/2025 21:44 | 6/1/2025 21:44 | 6/1/2025 21:44 |
| 9 | 6/1/2025 21:49 | 6/1/2025 21:49 | 6/1/2025 21:49 |
| 10 | 6/1/2025 21:54 | 6/1/2025 21:54 | 6/1/2025 21:54 |
The most popular value is an empty space; this needs replacing prior to datetime conversion.
# Check for most popular values
df[cols].value_counts().head().indexMultiIndex([( ' ', ' ', ' '),
('6/1/2025 21:34', '6/1/2025 21:34', '6/1/2025 21:34'),
('6/1/2025 21:39', '6/1/2025 21:39', '6/1/2025 21:39'),
('6/1/2025 21:44', '6/1/2025 21:44', '6/1/2025 21:44'),
('6/1/2025 21:49', '6/1/2025 21:49', '6/1/2025 21:49')],
names=['Time', 'Time.1', 'Time.2'])
# Replace blank values with NaT
df[cols] = df[cols].replace(' ', pd.NaT)
# Preview
df[cols].tail()| Time | Time.1 | Time.2 | |
|---|---|---|---|
| 667593 | NaT | NaT | NaT |
| 667594 | NaT | NaT | NaT |
| 667595 | NaT | NaT | NaT |
| 667596 | NaT | NaN | NaT |
| 667597 | NaT | NaN | NaN |
# Convert to datetime objects
for c in cols:
df[c] = pd.to_datetime(df[c], errors='coerce', format='%m/%d/%Y %H:%M')# Select the remaining Time columns
cols = [c for c in df.select_dtypes('O').columns if 'Time' in c]
# Preview
cols['Time.3', 'Time.4', 'Time.5']
# Preview data
df.loc[:, cols].head()| Time.3 | Time.4 | Time.5 | |
|---|---|---|---|
| 6 | 45658.8992 | 45658.8992 | 45658.8992 |
| 7 | 45658.90267 | 45658.90267 | 45658.90267 |
| 8 | 45658.90615 | 45658.90615 | 45658.90615 |
| 9 | 45658.90962 | 45658.90962 | 45658.90962 |
| 10 | 45658.91309 | 45658.91309 | 45658.91309 |
# Convert numeric time columns
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')# Check for most popular values
df[cols].value_counts().head().indexMultiIndex([( 45658.8992, 45658.8992, 45658.8992),
(45658.90267, 45658.90267, 45658.90267),
(45658.90615, 45658.90615, 45658.90615),
(45658.90962, 45658.90962, 45658.90962),
(45658.91309, 45658.91309, 45658.91309)],
names=['Time.3', 'Time.4', 'Time.5'])
# Convert time from Excel format
for c in cols:
df[c] = pd.to_datetime(df[c], origin='1899-12-30', unit='D', errors='coerce')# Preview data
df.loc[:, cols].head()| Time.3 | Time.4 | Time.5 | |
|---|---|---|---|
| 6 | 2025-01-01 21:34:50.879999982 | 2025-01-01 21:34:50.879999982 | 2025-01-01 21:34:50.879999982 |
| 7 | 2025-01-01 21:39:50.688000267 | 2025-01-01 21:39:50.688000267 | 2025-01-01 21:39:50.688000267 |
| 8 | 2025-01-01 21:44:51.360000216 | 2025-01-01 21:44:51.360000216 | 2025-01-01 21:44:51.360000216 |
| 9 | 2025-01-01 21:49:51.167999870 | 2025-01-01 21:49:51.167999870 | 2025-01-01 21:49:51.167999870 |
| 10 | 2025-01-01 21:54:50.976000155 | 2025-01-01 21:54:50.976000155 | 2025-01-01 21:54:50.976000155 |
# Check data types
df.dtypesTime datetime64[ns]
Flowline Pressure object
Time.1 datetime64[ns]
Wellhead Pressure object
Time.2 datetime64[ns]
Well Temperature object
Time.3 datetime64[ns]
Flowline Pressure.1 object
Time.4 datetime64[ns]
Wellhead Pressure.1 object
Time.5 datetime64[ns]
Well Temperature.1 object
dtype: object
# Identify remaining columns for conversion
cols = df.select_dtypes('O').columns# Convert remaining object types into numeric
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')# Check dtypes
df.dtypesTime datetime64[ns]
Flowline Pressure float64
Time.1 datetime64[ns]
Wellhead Pressure float64
Time.2 datetime64[ns]
Well Temperature float64
Time.3 datetime64[ns]
Flowline Pressure.1 float64
Time.4 datetime64[ns]
Wellhead Pressure.1 float64
Time.5 datetime64[ns]
Well Temperature.1 float64
dtype: object
# Map new columns names
cols = [
'time0',
'WQ1_0457_hz-FLP',
'time1',
'WQ1_0457_hz-FTP',
'time2',
'WQ1_0457_hz-FLT',
'time3',
'WQ1_0024-FLP',
'time4',
'WQ1_0024-FTP',
'time5',
'WQ1_0024-FLT',
]# Rename columns
df.columns = cols# Preview
df.head()| time0 | WQ1_0457_hz-FLP | time1 | WQ1_0457_hz-FTP | time2 | WQ1_0457_hz-FLT | time3 | WQ1_0024-FLP | time4 | WQ1_0024-FTP | time5 | WQ1_0024-FLT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 2025-06-01 21:34:00 | 223.519501 | 2025-06-01 21:34:00 | 270.167938 | 2025-06-01 21:34:00 | 53.507202 | 2025-01-01 21:34:50.879999982 | 213.945541 | 2025-01-01 21:34:50.879999982 | 228.819229 | 2025-01-01 21:34:50.879999982 | 51.586212 |
| 7 | 2025-06-01 21:39:00 | 223.519501 | 2025-06-01 21:39:00 | 270.399170 | 2025-06-01 21:39:00 | 53.401428 | 2025-01-01 21:39:50.688000267 | 214.524078 | 2025-01-01 21:39:50.688000267 | 228.732147 | 2025-01-01 21:39:50.688000267 | 51.586212 |
| 8 | 2025-06-01 21:44:00 | 223.158051 | 2025-06-01 21:44:00 | 270.399170 | 2025-06-01 21:44:00 | 53.401428 | 2025-01-01 21:44:51.360000216 | 214.524078 | 2025-01-01 21:44:51.360000216 | 228.732147 | 2025-01-01 21:44:51.360000216 | 51.876919 |
| 9 | 2025-06-01 21:49:00 | 223.158051 | 2025-06-01 21:49:00 | 269.784393 | 2025-06-01 21:49:00 | 53.348053 | 2025-01-01 21:49:51.167999870 | 215.008820 | 2025-01-01 21:49:51.167999870 | 228.674118 | 2025-01-01 21:49:51.167999870 | 51.876919 |
| 10 | 2025-06-01 21:54:00 | 223.961472 | 2025-06-01 21:54:00 | 269.784393 | 2025-06-01 21:54:00 | 53.348053 | 2025-01-01 21:54:50.976000155 | 215.008820 | 2025-01-01 21:54:50.976000155 | 228.674118 | 2025-01-01 21:54:50.976000155 | 52.097351 |
Export Data
# Export data
df.to_parquet(PROCESSED / 'well1.pqt')