internal: large test refactor and readme updates
kai-tub committed Jul 9, 2024
1 parent bb41370 commit c3d0966
Showing 81 changed files with 160 additions and 112 deletions.
44 changes: 23 additions & 21 deletions README.md
@@ -70,24 +70,26 @@ where the dictionary's key is the band name (`B01`, `B12`, `VV`, ...).
<summary>Example Input</summary>

```
<S1_ROOT_DIR>
└── S1A_IW_GRDH_1SDV_20170613T165043_33UUP_65_63
├── S1A_IW_GRDH_1SDV_20170613T165043_33UUP_65_63_VH.tif
└── S1A_IW_GRDH_1SDV_20170613T165043_33UUP_65_63_VV.tif
<S2_ROOT_DIR>
└── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B01.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B02.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B03.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B04.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B05.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B06.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B07.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B08.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B09.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B8A.tiff
├── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B11.tiff
└── S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23_B12.tiff
├── <S1_ROOT_DIR>
│ └── S1A_IW_GRDH_1SDV_20170613T165043
│ └── S1A_IW_GRDH_1SDV_20170613T165043_33UUP_70_48
│ ├── S1A_IW_GRDH_1SDV_20170613T165043_33UUP_70_48_VH.tif
│ └── S1A_IW_GRDH_1SDV_20170613T165043_33UUP_70_48_VV.tif
└── <S2_ROOT_DIR>
└── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP
└── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B01.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B02.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B03.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B04.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B05.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B06.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B07.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B08.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B09.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B8A.tif
├── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B11.tif
└── S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43_B12.tif
```

</details>
@@ -96,12 +98,12 @@
<summary>LMDB Result</summary>

```
'S1A_IW_GRDH_1SDV_20170613T165043_33UUP_65_63':
'S1A_IW_GRDH_1SDV_20170613T165043_33UUP_70_48':
{
'VH': <120x120 float32 safetensors image data>
'VV': <120x120 float32 safetensors image data>
},
'S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23':
'S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP':
{
'B01': <120x120 uint16 safetensors image data>,
'B02': <120x120 uint16 safetensors image data>,
@@ -138,7 +140,7 @@ env = lmdb.open(str(encoded_path), readonly=True)

with env.begin() as txn:
# string encoding is required to map the string to an LMDB key
safetensor_dict = load(txn.get("S2A_MSIL2A_20180526T100031_N9999_R122_T34WFU_14_23".encode()))
safetensor_dict = load(txn.get("S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP".encode()))

rgb_bands = ["B04", "B03", "B02"]
rgb_tensor = np.stack([safetensor_dict[b] for b in rgb_bands])
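For reference, the same access pattern works for the Sentinel-1 patch. A minimal, self-contained sketch that uses the reference LMDB shipped under `integration_tests/BigEarthNet_LMDB` and the Sentinel-1 key shown above:

```python
import lmdb
import numpy as np
from safetensors.numpy import load

# Reference LMDB directory from the integration tests.
env = lmdb.open("integration_tests/BigEarthNet_LMDB", readonly=True)

with env.begin() as txn:
    # LMDB keys are bytes, so the patch name has to be encoded first.
    safetensor_dict = load(
        txn.get("S1A_IW_GRDH_1SDV_20170613T165043_33UUP_70_48".encode())
    )

# Stack both polarizations into a single (2, 120, 120) float32 array.
s1_tensor = np.stack([safetensor_dict[b] for b in ["VV", "VH"]])
```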
1 change: 1 addition & 0 deletions flake.nix
@@ -144,6 +144,7 @@
})
pkgs.poetry
pkgs.quarto
pkgs.gdal
]
++ self.checks.${system}.pre-commit-check.enabledPackages;
})
Binary file modified integration_tests/BigEarthNet_LMDB/data.mdb
Binary file modified integration_tests/BigEarthNet_LMDB/lock.mdb
221 changes: 130 additions & 91 deletions integration_tests/test_python_integration.py
@@ -13,7 +13,6 @@
import pytest
import subprocess
import hashlib
from rico_hdl.rico_hdl import EUROSAT_MS_BANDS


def read_single_band_raster(path):
@@ -138,7 +137,7 @@ def encoded_eurosat_ms_path(eurosat_ms_root, tmpdir_factory) -> Path:
return Path(tmp_path)


def test_bigearthnet_integration(
def test_reproducibility_and_data_consistency(
s1_root, s2_root, encoded_bigearthnet_s1_s2_path, bigearthnet_lmdb_ref_path
):
s1_data = {file: read_single_band_raster(file) for file in s1_root.glob("**/*.tif")}
@@ -175,123 +174,163 @@ def test_bigearthnet_integration(
), "The newly generated LMDB file has a different hash compared to the reference one!"


def read_all_hyspecnet_bands(path):
"""
Given a path to a GeoTIFF return all bands as a dictionary,
where the key is the unformatted band index (starting from 1)
as a string and the value the array data
"""
with rasterio.open(path) as r:
return {f"B{i}": r.read(i) for i in range(1, r.count + 1)}
def test_bigearthnet_integration(
s1_root, s2_root, encoded_bigearthnet_s1_s2_path, bigearthnet_lmdb_ref_path
):
env = lmdb.open(str(encoded_bigearthnet_s1_s2_path), readonly=True)

with env.begin(write=False) as txn:
cur = txn.cursor()
decoded_lmdb_data = {k.decode("utf-8"): load(v) for (k, v) in cur}

assert decoded_lmdb_data.keys() == set(
[
"S1A_IW_GRDH_1SDV_20170613T165043_33UUP_70_48",
"S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43",
]
)

sample_s1_safetensors_dict = decoded_lmdb_data.get(
"S1A_IW_GRDH_1SDV_20170613T165043_33UUP_70_48"
)
sample_s2_safetensors_dict = decoded_lmdb_data.get(
"S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_75_43"
)
safetensors_s1_keys = sample_s1_safetensors_dict.keys()
safetensors_s2_keys = sample_s2_safetensors_dict.keys()
assert (
set(
[
"B01",
"B02",
"B03",
"B04",
"B05",
"B06",
"B07",
"B08",
"B8A",
"B09",
"B11",
"B12",
]
)
== safetensors_s2_keys
)
assert (
set(
[
"VV",
"VH",
]
)
== safetensors_s1_keys
)

assert all(arr.shape == (120, 120) for arr in sample_s1_safetensors_dict.values())
assert all(arr.dtype == "float32" for arr in sample_s1_safetensors_dict.values())

assert all(arr.dtype == "uint16" for arr in sample_s2_safetensors_dict.values())
assert all(
sample_s2_safetensors_dict[key].shape == (120, 120)
for key in ["B02", "B03", "B04", "B08"]
)
assert all(
sample_s2_safetensors_dict[key].shape == (60, 60)
for key in ["B05", "B06", "B07", "B8A", "B11", "B12"]
)
assert all(
sample_s2_safetensors_dict[key].shape == (20, 20) for key in ["B01", "B09"]
)


def test_hyspecnet_integration(hyspecnet_root, encoded_hyspecnet_path):
source_file_data = {
file: read_all_hyspecnet_bands(file)
for file in hyspecnet_root.glob("**/*SPECTRAL_IMAGE.TIF")
}
assert len(source_file_data) > 0

# Command used to create the encoded LMDB directory:
# ./result/bin/encoder --hyspecnet-11k <PATH> hyspec_artifacts/
env = lmdb.open(str(encoded_hyspecnet_path), readonly=True)

with env.begin(write=False) as txn:
cur = txn.cursor()
decoded_lmdb_data = {k.decode("utf-8"): load(v) for (k, v) in cur}

# The encoded data is nested inside another safetensors dictionary,
# where the inner keys are the band numbers formatted as 'B{N}' strings.
decoded_dicts = [d for d in decoded_lmdb_data.values()]
lmdb_keys = decoded_lmdb_data.keys()

# The core guarantee from the Python side is that the raster data survives the
# round trip unchanged. Iterate over every source file's band-name/array pairs
# (the band names match the keys used in the LMDB file) and assert that some
# safetensors dictionary in the LMDB contains an identical array under that
# band name.
for source_file, source_data_dict in source_file_data.items():
for source_key, source_data in source_data_dict.items():
assert any(
np.array_equal(source_data, decoded_dict[source_key])
for decoded_dict in decoded_dicts
), f"Couldn't find data in the LMDB database that matches the data from: {source_file}:{source_key}"


def read_all_uc_merced_bands(path):
"""
Given a path to a UC Merced TIFF file, return all bands as a dictionary
whose keys are the color channel names ('Red', 'Green', 'Blue').
"""
with rasterio.open(path) as r:
return {key: r.read(i) for i, key in enumerate(["Red", "Green", "Blue"], 1)}
# The test fixture contains only two samples.
assert len(lmdb_keys) == 2

assert (
"ENMAP01-____L2A-DT0000004950_20221103T162438Z_001_V010110_20221118T145147Z-Y01460273_X03110438"
in lmdb_keys
)
assert (
"ENMAP01-____L2A-DT0000004950_20221103T162438Z_001_V010110_20221118T145147Z-Y01460273_X04390566"
in lmdb_keys
)

sample_safetensors_dict = decoded_lmdb_data.get(
"ENMAP01-____L2A-DT0000004950_20221103T162438Z_001_V010110_20221118T145147Z-Y01460273_X03110438"
)
safetensors_keys = sample_safetensors_dict.keys()
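# HySpecNet band keys are 1-based and not zero-padded: 'B1' through 'B224'.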
assert "B1" in safetensors_keys
assert "B100" in safetensors_keys
assert "B224" in safetensors_keys

assert "B0" not in safetensors_keys
assert "B01" not in safetensors_keys
assert "B225" not in safetensors_keys

assert all(arr.shape == (128, 128) for arr in sample_safetensors_dict.values())
assert all(arr.dtype == "int16" for arr in sample_safetensors_dict.values())


@pytest.mark.filterwarnings("ignore:Dataset has no geotransform")
def test_uc_merced_integration(uc_merced_root, encoded_uc_merced_path):
source_file_data = {
file: read_all_uc_merced_bands(file) for file in uc_merced_root.glob("**/*.tif")
}
assert len(source_file_data) > 0

# The encoded LMDB directory is provided by the `encoded_uc_merced_path` fixture.
env = lmdb.open(str(encoded_uc_merced_path), readonly=True)

with env.begin(write=False) as txn:
cur = txn.cursor()
decoded_lmdb_data = {k.decode("utf-8"): load(v) for (k, v) in cur}

# The encoded data is nested inside another safetensors dictionary,
# where the inner keys are the color channel names ('Red', 'Green', 'Blue').
decoded_dicts = [d for d in decoded_lmdb_data.values()]
lmdb_keys = decoded_lmdb_data.keys()
assert lmdb_keys == set(["airplane00", "airplane42", "forest10", "forest99"])

# The core guarantee from the Python side is that the raster data survives the
# round trip unchanged. Iterate over every source file's band-name/array pairs
# (the band names match the keys used in the LMDB file) and assert that some
# safetensors dictionary in the LMDB contains an identical array under that
# band name.
for source_file, source_data_dict in source_file_data.items():
for source_key, source_data in source_data_dict.items():
assert any(
np.array_equal(source_data, decoded_dict[source_key])
for decoded_dict in decoded_dicts
), f"Couldn't find data in the LMDB database that matches the data from: {source_file}:{source_key}"


def read_all_eurosat_ms_bands(path):
"""
Given a path to a EuroSAT MS TIFF file, return all bands as a dictionary
whose keys are the EuroSAT MS band names.
"""
with rasterio.open(path) as r:
return {key: r.read(i) for i, key in enumerate(EUROSAT_MS_BANDS, start=1)}
sample_safetensors_dict = decoded_lmdb_data.get("airplane00")
safetensors_keys = sample_safetensors_dict.keys()
assert set(["Red", "Green", "Blue"]) == safetensors_keys

assert all(arr.shape == (256, 256) for arr in sample_safetensors_dict.values())
assert all(arr.dtype == "uint8" for arr in sample_safetensors_dict.values())

def test_eurosat_integration(eurosat_ms_root, encoded_eurosat_ms_path):
source_file_data = {
file: read_all_eurosat_ms_bands(file)
for file in eurosat_ms_root.glob("**/*.tif")
}
assert len(source_file_data) > 0

def test_eurosat_integration(eurosat_ms_root, encoded_eurosat_ms_path):
env = lmdb.open(str(encoded_eurosat_ms_path), readonly=True)

with env.begin(write=False) as txn:
cur = txn.cursor()
decoded_lmdb_data = {k.decode("utf-8"): load(v) for (k, v) in cur}

# The encoded data is nested inside another safetensors dictionary,
# where the inner keys are the EuroSAT MS band names.
decoded_dicts = [d for d in decoded_lmdb_data.values()]
assert decoded_lmdb_data.keys() == set(["AnnualCrop_1", "Pasture_300", "SeaLake_3000"])

# The core guarantee from the Python side is that the raster data survives the
# round trip unchanged. Iterate over every source file's band-name/array pairs
# (the band names match the keys used in the LMDB file) and assert that some
# safetensors dictionary in the LMDB contains an identical array under that
# band name.
for source_file, source_data_dict in source_file_data.items():
for source_key, source_data in source_data_dict.items():
assert any(
np.array_equal(source_data, decoded_dict[source_key])
for decoded_dict in decoded_dicts
), f"Couldn't find data in the LMDB database that matches the data from: {source_file}:{source_key}"
sample_safetensors_dict = decoded_lmdb_data.get("AnnualCrop_1")
safetensors_keys = sample_safetensors_dict.keys()
assert (
set(
[
"B01",
"B02",
"B03",
"B04",
"B05",
"B06",
"B07",
"B08",
"B09",
"B10",
"B11",
"B12",
"B08A",
]
)
== safetensors_keys
)

assert all(arr.shape == (64, 64) for arr in sample_safetensors_dict.values())
assert all(arr.dtype == "uint16" for arr in sample_safetensors_dict.values())
@@ -0,0 +1 @@
{"GRD_Post_Processing_facility_country": "Germany", "GRD_Post_Processing_facility_name": "Copernicus S1 Core Ground Segment - DPA", "GRD_Post_Processing_facility_org": "ESA", "GRD_Post_Processing_facility_site": "DLR-Oberpfaffenhofen", "GRD_Post_Processing_software_name": "Sentinel-1 IPF", "GRD_Post_Processing_software_version": "003.20", "GRD_Post_Processing_start": 1591515136232, "GRD_Post_Processing_stop": 1591515805000, "S1TBX_Calibration_vers": "7.0.2", "S1TBX_SAR_Processing_vers": "7.0.2", "SLC_Processing_facility_country": "Germany", "SLC_Processing_facility_name": "Copernicus S1 Core Ground Segment - DPA", "SLC_Processing_facility_org": "ESA", "SLC_Processing_facility_site": "DLR-Oberpfaffenhofen", "SLC_Processing_software_name": "Sentinel-1 IPF", "SLC_Processing_software_version": "003.20", "SLC_Processing_start": 1591515262000, "SLC_Processing_stop": 1591515588000, "SNAP_Graph_Processing_Framework_GPF_vers": "7.0.3", "cycleNumber": 202, "familyName": "SENTINEL-1", "instrument": "Synthetic Aperture Radar", "instrumentConfigurationID": 6, "instrumentMode": "IW", "instrumentSwath": "IW", "missionDataTakeID": 249786, "nssdcIdentifier": "2014-016A", "orbitNumber_start": 32904, "orbitNumber_stop": 32904, "orbitProperties_ascendingNodeTime": 1591489636263, "orbitProperties_pass": "DESCENDING", "phaseIdentifier": 1, "platform_number": "A", "productClass": "S", "productClassDescription": "SAR Standard L1 Product", "productComposition": "Slice", "productTimelinessCategory": "Fast-24h", "productType": "GRD", "relativeOrbitNumber_start": 107, "relativeOrbitNumber_stop": 107, "resolution": "H", "resolution_meters": 10, "segmentStartTime": 1591491826734, "sliceNumber": 11, "sliceProductFlag": "true", "startTimeANX": 2444172, "stopTimeANX": 2469170, "system:asset_size": 4068315129, "system:band_names": ["VV", "VH", "angle"], "system:bands": {"VV": {"data_type": {"type": "PixelType", "precision": "double"}, "dimensions": [28796, 21782], "crs": "EPSG:32643", "crs_transform": [10, 0, 158264.4428845317, 0, -10, 3560206.2252167463]}, "VH": {"data_type": {"type": "PixelType", "precision": "double"}, "dimensions": [28796, 21782], "crs": "EPSG:32643", "crs_transform": [10, 0, 158264.4428845317, 0, -10, 3560206.2252167463]}, "angle": {"data_type": {"type": "PixelType", "precision": "float"}, "dimensions": [21, 10], "crs": "EPSG:32643", "crs_transform": [-12649.568506065058, -4004.0974110317184, 445997.78946215136, 2497.228151544463, -20014.136037246324, 3509351.2937052143]}}, "system:footprint": {"type": "LinearRing", "coordinates": [[74.09944366808917, 30.322941105920485], [74.11179673580456, 30.37799552291308], [74.14121108030363, 30.509195100457447], [74.16920220381431, 30.633901024968573], [74.1918409406636, 30.734743011525897], [74.26580047664909, 31.063259348915857], [74.37671941065409, 31.55430939246343], [74.4111095681767, 31.72167258547177], [73.11483574249691, 31.92723377904315], [72.46694291921783, 32.02465147462241], [71.79330395372021, 32.1221767847333], [71.77892267611422, 32.05015830297902], [71.68045385507203, 31.550154047055074], [71.61975493537936, 31.239760484381762], [71.55935699012439, 30.929354427853166], [71.49891656908433, 30.617200319148733], [72.86781294238061, 30.409808056714127], [74.0749770135709, 30.21425555873582], [74.09944366808917, 30.322941105920485]]}, "system:id": "COPERNICUS/S1_GRD/S1A_IW_GRDH_1SDV_20200607T010800_20200607T010825_032904_03CFBA_D457", "system:index": "S1A_IW_GRDH_1SDV_20200607T010800_20200607T010825_032904_03CFBA_D457", "system:time_end": 
1591492080000, "system:time_start": 1591492080000, "system:version": 1640963949863000.0, "totalSlices": 18, "transmitterReceiverPolarisation": ["VV", "VH"]}