Read and Examine Data

System Details

# Imports
import pandas as pd
import watermark
import hvplot
import hvplot.pandas  # noqa
import pathlib
# System version
print(watermark.watermark())
Last updated: 2025-07-07T16:43:52.152160+01:00

Python implementation: CPython
Python version       : 3.12.11
IPython version      : 9.4.0

Compiler    : GCC 13.3.0
OS          : Linux
Release     : 6.6.87.2-microsoft-standard-WSL2
Machine     : x86_64
Processor   : x86_64
CPU cores   : 20
Architecture: 64bit
# Package versions
print(watermark.watermark(iversions=True, globals_=globals()))
watermark: 2.5.0
hvplot   : 0.11.3
pandas   : 2.3.0

Paths

# Set base path
BASE = pathlib.Path(r"/home/miah0x41/repos/basra-well-monitoring")

# DATA
DATA = BASE / 'data'

# RAW
RAW = DATA / 'raw'

# PROCESSED
PROCESSED = DATA / 'processed'

# Create directory if it doesn't exist
PROCESSED.mkdir(parents=True, exist_ok=True)

Read Data

# List files
files = list(RAW.iterdir())

# Preview
[f.name for f in files]
['WQ1_0457.csv - Sheet1.csv',
 'well Data 2.csv',
 'WQ1_0024.csv - Sheet1.csv',
 'well Data 1b.csv',
 'well Data 2b.csv',
 'well Data 1.csv']

The first three rows contains the column names in a nested fashion. Read the data with line 2 as the column names.

!head -n 20 "{RAW / files[-1]}"
Data Recovery Area,Closing SDV,,,,,,Data Recovery Area,,Wellhead pressure increase,,,
Time,Flowline Pressure,Time,Wellhead Pressure,Time,Well Temperature,,Time,Flowline Pressure,Time,Wellhead Pressure,Time,Well Temperature
,WQ1_0457_hz-FLP,,WQ1_0457_hz-FTP,,WQ1_0457_hz-FLT,,,WQ1_0024-FLP,,WQ1_0024-FTP,,WQ1_0024-FLT
,,,,,,,,,,,,
,,,,,,,,,,,,
,,,,,,,,,,,,
,,,,,,,,,,,,
,,,,,,,,,,,,
6/1/2025 21:34,223.5195007,6/1/2025 21:34,270.1679382,6/1/2025 21:34,53.50720215,,45658.8992,213.9455414,45658.8992,228.8192291,45658.8992,51.58621216
6/1/2025 21:39,223.5195007,6/1/2025 21:39,270.3991699,6/1/2025 21:39,53.40142822,,45658.90267,214.5240784,45658.90267,228.7321472,45658.90267,51.58621216
6/1/2025 21:44,223.1580505,6/1/2025 21:44,270.3991699,6/1/2025 21:44,53.40142822,,45658.90615,214.5240784,45658.90615,228.7321472,45658.90615,51.87691879
6/1/2025 21:49,223.1580505,6/1/2025 21:49,269.7843933,6/1/2025 21:49,53.34805298,,45658.90962,215.0088196,45658.90962,228.674118,45658.90962,51.87691879
6/1/2025 21:54,223.9614716,6/1/2025 21:54,269.7843933,6/1/2025 21:54,53.34805298,,45658.91309,215.0088196,45658.91309,228.674118,45658.91309,52.09735107
6/1/2025 21:59,223.9614716,6/1/2025 21:59,269.1842651,6/1/2025 21:59,53.31414795,,45658.91656,214.7212524,45658.91656,228.3285675,45658.91656,52.09735107
6/1/2025 22:04,223.332428,6/1/2025 22:04,269.1842651,6/1/2025 22:04,53.31414795,,45658.92003,214.7212524,45658.92003,228.3285675,45658.92003,52.11193466
6/1/2025 22:09,223.332428,6/1/2025 22:09,268.6045837,6/1/2025 22:09,53.34472275,,45658.92351,215.3478546,45658.92351,227.9902039,45658.92351,52.11193466
6/1/2025 22:14,223.5942993,6/1/2025 22:14,268.6045837,6/1/2025 22:14,53.34472275,,45658.92698,215.3478546,45658.92698,227.9902039,45658.92698,52.09765625
6/1/2025 22:19,223.5942993,6/1/2025 22:19,268.6217041,6/1/2025 22:19,53.49682617,,45658.93045,213.5865479,45658.93045,227.9753265,45658.93045,52.09765625
6/1/2025 22:24,223.9290161,6/1/2025 22:24,268.6217041,6/1/2025 22:24,53.49682617,,45658.93392,213.5865479,45658.93392,227.9753265,45658.93392,52.00146484
6/1/2025 22:29,223.9290161,6/1/2025 22:29,271.5575867,6/1/2025 22:29,53.56515503,,45658.9374,214.236084,45658.9374,227.6369019,45658.9374,52.00146484
# Read Well Data
df = pd.read_csv(RAW / 'well Data 1.csv', header=1)
# Preview
df.head(10)
Time Flowline Pressure Time.1 Wellhead Pressure Time.2 Well Temperature Unnamed: 6 Time.3 Flowline Pressure.1 Time.4 Wellhead Pressure.1 Time.5 Well Temperature.1
0 NaN WQ1_0457_hz-FLP NaN WQ1_0457_hz-FTP NaN WQ1_0457_hz-FLT NaN NaN WQ1_0024-FLP NaN WQ1_0024-FTP NaN WQ1_0024-FLT
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 6/1/2025 21:34 223.5195007 6/1/2025 21:34 270.1679382 6/1/2025 21:34 53.50720215 NaN 45658.8992 213.9455414 45658.8992 228.8192291 45658.8992 51.58621216
7 6/1/2025 21:39 223.5195007 6/1/2025 21:39 270.3991699 6/1/2025 21:39 53.40142822 NaN 45658.90267 214.5240784 45658.90267 228.7321472 45658.90267 51.58621216
8 6/1/2025 21:44 223.1580505 6/1/2025 21:44 270.3991699 6/1/2025 21:44 53.40142822 NaN 45658.90615 214.5240784 45658.90615 228.7321472 45658.90615 51.87691879
9 6/1/2025 21:49 223.1580505 6/1/2025 21:49 269.7843933 6/1/2025 21:49 53.34805298 NaN 45658.90962 215.0088196 45658.90962 228.674118 45658.90962 51.87691879
# Remove columns with nulls
df = df.dropna(axis=1, how='all')

# Preview
df.head(10)
Time Flowline Pressure Time.1 Wellhead Pressure Time.2 Well Temperature Time.3 Flowline Pressure.1 Time.4 Wellhead Pressure.1 Time.5 Well Temperature.1
0 NaN WQ1_0457_hz-FLP NaN WQ1_0457_hz-FTP NaN WQ1_0457_hz-FLT NaN WQ1_0024-FLP NaN WQ1_0024-FTP NaN WQ1_0024-FLT
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 6/1/2025 21:34 223.5195007 6/1/2025 21:34 270.1679382 6/1/2025 21:34 53.50720215 45658.8992 213.9455414 45658.8992 228.8192291 45658.8992 51.58621216
7 6/1/2025 21:39 223.5195007 6/1/2025 21:39 270.3991699 6/1/2025 21:39 53.40142822 45658.90267 214.5240784 45658.90267 228.7321472 45658.90267 51.58621216
8 6/1/2025 21:44 223.1580505 6/1/2025 21:44 270.3991699 6/1/2025 21:44 53.40142822 45658.90615 214.5240784 45658.90615 228.7321472 45658.90615 51.87691879
9 6/1/2025 21:49 223.1580505 6/1/2025 21:49 269.7843933 6/1/2025 21:49 53.34805298 45658.90962 215.0088196 45658.90962 228.674118 45658.90962 51.87691879
# Remove rows with nulls
df = df.dropna(axis=0, how='all')

# Preview
df.head(10)
Time Flowline Pressure Time.1 Wellhead Pressure Time.2 Well Temperature Time.3 Flowline Pressure.1 Time.4 Wellhead Pressure.1 Time.5 Well Temperature.1
0 NaN WQ1_0457_hz-FLP NaN WQ1_0457_hz-FTP NaN WQ1_0457_hz-FLT NaN WQ1_0024-FLP NaN WQ1_0024-FTP NaN WQ1_0024-FLT
6 6/1/2025 21:34 223.5195007 6/1/2025 21:34 270.1679382 6/1/2025 21:34 53.50720215 45658.8992 213.9455414 45658.8992 228.8192291 45658.8992 51.58621216
7 6/1/2025 21:39 223.5195007 6/1/2025 21:39 270.3991699 6/1/2025 21:39 53.40142822 45658.90267 214.5240784 45658.90267 228.7321472 45658.90267 51.58621216
8 6/1/2025 21:44 223.1580505 6/1/2025 21:44 270.3991699 6/1/2025 21:44 53.40142822 45658.90615 214.5240784 45658.90615 228.7321472 45658.90615 51.87691879
9 6/1/2025 21:49 223.1580505 6/1/2025 21:49 269.7843933 6/1/2025 21:49 53.34805298 45658.90962 215.0088196 45658.90962 228.674118 45658.90962 51.87691879
10 6/1/2025 21:54 223.9614716 6/1/2025 21:54 269.7843933 6/1/2025 21:54 53.34805298 45658.91309 215.0088196 45658.91309 228.674118 45658.91309 52.09735107
11 6/1/2025 21:59 223.9614716 6/1/2025 21:59 269.1842651 6/1/2025 21:59 53.31414795 45658.91656 214.7212524 45658.91656 228.3285675 45658.91656 52.09735107
12 6/1/2025 22:04 223.332428 6/1/2025 22:04 269.1842651 6/1/2025 22:04 53.31414795 45658.92003 214.7212524 45658.92003 228.3285675 45658.92003 52.11193466
13 6/1/2025 22:09 223.332428 6/1/2025 22:09 268.6045837 6/1/2025 22:09 53.34472275 45658.92351 215.3478546 45658.92351 227.9902039 45658.92351 52.11193466
14 6/1/2025 22:14 223.5942993 6/1/2025 22:14 268.6045837 6/1/2025 22:14 53.34472275 45658.92698 215.3478546 45658.92698 227.9902039 45658.92698 52.09765625
# Remove first row
df = df.iloc[1:]

# Preview
df.head(10)
Time Flowline Pressure Time.1 Wellhead Pressure Time.2 Well Temperature Time.3 Flowline Pressure.1 Time.4 Wellhead Pressure.1 Time.5 Well Temperature.1
6 6/1/2025 21:34 223.5195007 6/1/2025 21:34 270.1679382 6/1/2025 21:34 53.50720215 45658.8992 213.9455414 45658.8992 228.8192291 45658.8992 51.58621216
7 6/1/2025 21:39 223.5195007 6/1/2025 21:39 270.3991699 6/1/2025 21:39 53.40142822 45658.90267 214.5240784 45658.90267 228.7321472 45658.90267 51.58621216
8 6/1/2025 21:44 223.1580505 6/1/2025 21:44 270.3991699 6/1/2025 21:44 53.40142822 45658.90615 214.5240784 45658.90615 228.7321472 45658.90615 51.87691879
9 6/1/2025 21:49 223.1580505 6/1/2025 21:49 269.7843933 6/1/2025 21:49 53.34805298 45658.90962 215.0088196 45658.90962 228.674118 45658.90962 51.87691879
10 6/1/2025 21:54 223.9614716 6/1/2025 21:54 269.7843933 6/1/2025 21:54 53.34805298 45658.91309 215.0088196 45658.91309 228.674118 45658.91309 52.09735107
11 6/1/2025 21:59 223.9614716 6/1/2025 21:59 269.1842651 6/1/2025 21:59 53.31414795 45658.91656 214.7212524 45658.91656 228.3285675 45658.91656 52.09735107
12 6/1/2025 22:04 223.332428 6/1/2025 22:04 269.1842651 6/1/2025 22:04 53.31414795 45658.92003 214.7212524 45658.92003 228.3285675 45658.92003 52.11193466
13 6/1/2025 22:09 223.332428 6/1/2025 22:09 268.6045837 6/1/2025 22:09 53.34472275 45658.92351 215.3478546 45658.92351 227.9902039 45658.92351 52.11193466
14 6/1/2025 22:14 223.5942993 6/1/2025 22:14 268.6045837 6/1/2025 22:14 53.34472275 45658.92698 215.3478546 45658.92698 227.9902039 45658.92698 52.09765625
15 6/1/2025 22:19 223.5942993 6/1/2025 22:19 268.6217041 6/1/2025 22:19 53.49682617 45658.93045 213.5865479 45658.93045 227.9753265 45658.93045 52.09765625

Data Types

  • Convert the multiple Time columns into datetime format.
  • Convert the remaining data into appropriate formats
  • Rename the columns

There are two time formats:

  • 6/1/2025 21:39 and 45658.90267
  • The latter is a serialised Excel format
# Examine data types
df.dtypes
Time                   object
Flowline Pressure      object
Time.1                 object
Wellhead Pressure      object
Time.2                 object
Well Temperature       object
Time.3                 object
Flowline Pressure.1    object
Time.4                 object
Wellhead Pressure.1    object
Time.5                 object
Well Temperature.1     object
dtype: object
# Select time columns for conversion:
cols = [
    'Time',
    'Time.1',
    'Time.2'
]

# Preview
df.loc[:, cols].head()
Time Time.1 Time.2
6 6/1/2025 21:34 6/1/2025 21:34 6/1/2025 21:34
7 6/1/2025 21:39 6/1/2025 21:39 6/1/2025 21:39
8 6/1/2025 21:44 6/1/2025 21:44 6/1/2025 21:44
9 6/1/2025 21:49 6/1/2025 21:49 6/1/2025 21:49
10 6/1/2025 21:54 6/1/2025 21:54 6/1/2025 21:54

The most popular value is an empty space; this needs replacing prior to datetime conversion.

# Check for most popular values
df[cols].value_counts().head().index
MultiIndex([(             ' ',              ' ',              ' '),
            ('6/1/2025 21:34', '6/1/2025 21:34', '6/1/2025 21:34'),
            ('6/1/2025 21:39', '6/1/2025 21:39', '6/1/2025 21:39'),
            ('6/1/2025 21:44', '6/1/2025 21:44', '6/1/2025 21:44'),
            ('6/1/2025 21:49', '6/1/2025 21:49', '6/1/2025 21:49')],
           names=['Time', 'Time.1', 'Time.2'])
# Replace blank values with NaT
df[cols] = df[cols].replace(' ', pd.NaT)

# Preview
df[cols].tail()
Time Time.1 Time.2
667593 NaT NaT NaT
667594 NaT NaT NaT
667595 NaT NaT NaT
667596 NaT NaN NaT
667597 NaT NaN NaN
# Convert to datetime objects
for c in cols:
    df[c] = pd.to_datetime(df[c], errors='coerce', format='%m/%d/%Y %H:%M')
# Select the remaining Time columns
cols = [c for c in df.select_dtypes('O').columns if 'Time' in c]

# Preview
cols
['Time.3', 'Time.4', 'Time.5']
# Preview data
df.loc[:, cols].head()
Time.3 Time.4 Time.5
6 45658.8992 45658.8992 45658.8992
7 45658.90267 45658.90267 45658.90267
8 45658.90615 45658.90615 45658.90615
9 45658.90962 45658.90962 45658.90962
10 45658.91309 45658.91309 45658.91309
# Convert numeric time columns
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
# Check for most popular values
df[cols].value_counts().head().index
MultiIndex([( 45658.8992,  45658.8992,  45658.8992),
            (45658.90267, 45658.90267, 45658.90267),
            (45658.90615, 45658.90615, 45658.90615),
            (45658.90962, 45658.90962, 45658.90962),
            (45658.91309, 45658.91309, 45658.91309)],
           names=['Time.3', 'Time.4', 'Time.5'])
# Convert time from Excel format
for c in cols:
    df[c] = pd.to_datetime(df[c], origin='1899-12-30', unit='D', errors='coerce')
# Preview data
df.loc[:, cols].head()
Time.3 Time.4 Time.5
6 2025-01-01 21:34:50.879999982 2025-01-01 21:34:50.879999982 2025-01-01 21:34:50.879999982
7 2025-01-01 21:39:50.688000267 2025-01-01 21:39:50.688000267 2025-01-01 21:39:50.688000267
8 2025-01-01 21:44:51.360000216 2025-01-01 21:44:51.360000216 2025-01-01 21:44:51.360000216
9 2025-01-01 21:49:51.167999870 2025-01-01 21:49:51.167999870 2025-01-01 21:49:51.167999870
10 2025-01-01 21:54:50.976000155 2025-01-01 21:54:50.976000155 2025-01-01 21:54:50.976000155
# Check data types
df.dtypes
Time                   datetime64[ns]
Flowline Pressure              object
Time.1                 datetime64[ns]
Wellhead Pressure              object
Time.2                 datetime64[ns]
Well Temperature               object
Time.3                 datetime64[ns]
Flowline Pressure.1            object
Time.4                 datetime64[ns]
Wellhead Pressure.1            object
Time.5                 datetime64[ns]
Well Temperature.1             object
dtype: object
# Identify remaining columns for conversion
cols = df.select_dtypes('O').columns
# Convert remaining object types into numeric
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
# Check dtypes
df.dtypes
Time                   datetime64[ns]
Flowline Pressure             float64
Time.1                 datetime64[ns]
Wellhead Pressure             float64
Time.2                 datetime64[ns]
Well Temperature              float64
Time.3                 datetime64[ns]
Flowline Pressure.1           float64
Time.4                 datetime64[ns]
Wellhead Pressure.1           float64
Time.5                 datetime64[ns]
Well Temperature.1            float64
dtype: object
# Map new columns names
cols = [
    'time0',
    'WQ1_0457_hz-FLP',
    'time1',
    'WQ1_0457_hz-FTP',
    'time2',
    'WQ1_0457_hz-FLT',
    'time3',
    'WQ1_0024-FLP',
    'time4',
    'WQ1_0024-FTP',
    'time5',
    'WQ1_0024-FLT',
]
# Rename columns
df.columns = cols
# Preview
df.head()
time0 WQ1_0457_hz-FLP time1 WQ1_0457_hz-FTP time2 WQ1_0457_hz-FLT time3 WQ1_0024-FLP time4 WQ1_0024-FTP time5 WQ1_0024-FLT
6 2025-06-01 21:34:00 223.519501 2025-06-01 21:34:00 270.167938 2025-06-01 21:34:00 53.507202 2025-01-01 21:34:50.879999982 213.945541 2025-01-01 21:34:50.879999982 228.819229 2025-01-01 21:34:50.879999982 51.586212
7 2025-06-01 21:39:00 223.519501 2025-06-01 21:39:00 270.399170 2025-06-01 21:39:00 53.401428 2025-01-01 21:39:50.688000267 214.524078 2025-01-01 21:39:50.688000267 228.732147 2025-01-01 21:39:50.688000267 51.586212
8 2025-06-01 21:44:00 223.158051 2025-06-01 21:44:00 270.399170 2025-06-01 21:44:00 53.401428 2025-01-01 21:44:51.360000216 214.524078 2025-01-01 21:44:51.360000216 228.732147 2025-01-01 21:44:51.360000216 51.876919
9 2025-06-01 21:49:00 223.158051 2025-06-01 21:49:00 269.784393 2025-06-01 21:49:00 53.348053 2025-01-01 21:49:51.167999870 215.008820 2025-01-01 21:49:51.167999870 228.674118 2025-01-01 21:49:51.167999870 51.876919
10 2025-06-01 21:54:00 223.961472 2025-06-01 21:54:00 269.784393 2025-06-01 21:54:00 53.348053 2025-01-01 21:54:50.976000155 215.008820 2025-01-01 21:54:50.976000155 228.674118 2025-01-01 21:54:50.976000155 52.097351

Export Data

# Export data
df.to_parquet(PROCESSED / 'well1.pqt')
Back to top