Source code for utils.datasetProcessing

import numpy as np
import sys
sys.path.append('../')
import scipy.io
import os
import cv2
import json
from sklearn.cluster import KMeans

# Simple functions commonly used for (pre-)processing of datasets

def get_dataset_height_width_channels(datadir,extension):
    """
    Collects the height, width and channel count of every image in a directory.

    Args:
        datadir: directory holding the images (a trailing separator is optional).
        extension: filename suffix used to select the images, e.g. '.tif'.

    Outputs:
        The filenames returned by get_dataset_filenames, and a float array of
        shape (number of files, 3) whose row i holds the height, width and
        channel count of the i-th file.

    Raises:
        IOError: if an image cannot be read by cv2.imread.
    """
    files = get_dataset_filenames(datadir,extension)
    HWC   = np.zeros([len(files),3])
    for i, fname in enumerate(files):
        # os.path.join gives the right path whether or not datadir ends with a
        # separator; plain concatenation silently produced "dirname.tif".
        path = os.path.join(datadir, fname)
        img  = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising
            raise IOError("Could not read image: {}".format(path))
        h, w, c = img.shape
        HWC[i] = (h, w, c)
    return files, HWC
        
def get_dataset_filenames(datadir,extension):
    """
    Lists the files in a directory whose names end with a given suffix.

    Args:
        datadir: directory to scan (not recursive).
        extension: filename suffix to keep, e.g. '.tif'.

    Outputs:
        1-D numpy array of matching filenames (names only, no directory part),
        in the order reported by os.listdir. The array is empty when nothing
        matches.
    """
    # Collect in a plain list and convert once: calling np.append inside the
    # loop recopies the whole array for every match.
    matches = [name for name in os.listdir(datadir) if name.endswith(extension)]
    # dtype=str keeps the result a string array even when there are no matches
    return np.array(matches, dtype=str)

def get_labels_geojson(fname="xView_train.geojson"):
    """
    Processes a WorldView3 GEOJSON file

    Args:
        fname: filepath to the GeoJson file.

    Outputs:
        Bounding box coordinate array, Chip-name array, and Classes array.
        All three have one entry per feature in the file. A feature whose
        'bounds_imcoords' is empty keeps an all-zero box and class 0, and its
        chip name is the string 'None'.

    Raises:
        ValueError: if a bounding box does not hold exactly four integers.
    """
    with open(fname) as f:
        data = json.load(f)

    features = data['features']
    n_features = len(features)
    coords = np.zeros((n_features, 4))
    chips = np.zeros(n_features, dtype="object")
    classes = np.zeros(n_features)

    for i, feature in enumerate(features):
        props = feature['properties']
        bounds = props['bounds_imcoords']
        if not bounds:
            # Covers [] as well as an empty string; the latter used to slip
            # past the "!= []" check and fail inside int('').
            chips[i] = 'None'
            continue

        box = [int(num) for num in bounds.split(",")]
        if len(box) != 4:
            # Without this check a one-value box would silently broadcast
            # across all four columns of coords.
            raise ValueError(
                "Feature {} has {} bounding box values, expected 4".format(i, len(box))
            )
        chips[i] = props['image_id']
        classes[i] = props['type_id']
        coords[i] = box

    return coords, chips, classes
def strip_image_number_from_filename(imgname,splitchar):
    """
    Strips away file extension AND strings prepended to image number that are
    separated with splitchar, e.g., train_image_1234.tif --> 1234

    Args:
        imgname: image filename.
        splitchar: separator placed before the image number.

    Outputs:
        The image number as an int.

    Raises:
        ValueError: if the text between the last splitchar and the first
            following '.' is not an integer.
    """
    tail = imgname.split(splitchar)[-1]
    return int(tail.split('.')[0])


def strip_geojson(data,ids):
    """
    Strip away all data from geojson file except those images whose numbers
    are specified by ids.

    Args:
        data: parsed GeoJson dictionary; its 'features' entry is replaced in place.
        ids: image numbers to keep; each one is matched against
            'image_id' == str(id) + '.tif'.

    Outputs:
        The same data dictionary, with 'features' reduced to the kept entries.
        Features are grouped in the order of ids and keep their original
        relative order within each image; an id listed twice contributes its
        features twice.
    """
    # Group the features by image once, so the full list is not rescanned
    # for every requested id.
    features_by_image = {}
    for feature in data['features']:
        features_by_image.setdefault(feature['properties']['image_id'], []).append(feature)

    kept = []
    for image_number in ids:
        kept.extend(features_by_image.get(str(image_number) + '.tif', []))

    data['features'] = kept
    return data
def determine_number_of_class_members(opt):
    """
    Function to determine the number of elements in each class of a dataset.

    Args:
        opt: options object; reads opt.targetspath (path handed to
            get_labels_geojson) and opt.invalid_class_list (class ids to
            leave out of the result).

    Outputs:
        Sorted array of the class ids present in the labels minus the invalid
        ones, and a float array with the number of labels in each of those
        classes (same order).
    """
    _, _, classes = get_labels_geojson(opt.targetspath)
    unique_classes = np.setdiff1d(np.unique(classes), opt.invalid_class_list)
    # dtype=float keeps the counts in the same float array form as before
    num_members = np.array(
        [np.count_nonzero(classes == class_id) for class_id in unique_classes],
        dtype=float,
    )
    return unique_classes, num_members
def determine_common_and_rare_classes(opt):
    """
    Function to determine the common and rare classes in a dataset using 2-means.

    The per-class member counts are split into two clusters; the cluster with
    the larger centre (more members per class) is reported as common and the
    other one as rare.

    Args:
        opt: options object forwarded to determine_number_of_class_members.

    Outputs:
        List of common class ids and list of rare class ids (integers).
    """
    unique_classes, num_members = determine_number_of_class_members(opt)
    k_classes = KMeans(2,random_state=0).fit(num_members.reshape(-1,1))

    # KMeans numbers its clusters arbitrarily, so label 0 is not guaranteed
    # to be the rare cluster. Identify the clusters by their centres instead.
    centers = k_classes.cluster_centers_.ravel()
    rare_label = int(np.argmin(centers))
    common_label = 1 - rare_label

    rare_classes = unique_classes[k_classes.labels_ == rare_label].astype(int)
    common_classes = unique_classes[k_classes.labels_ == common_label].astype(int)
    return list(common_classes), list(rare_classes)
def determine_small_medium_large_classes(opt):
    """
    Function to determine the small/medium/large size classes in a dataset using 3-means.

    The mean bounding-box area of each class is split into three clusters,
    which are ranked by their centres: smallest centre -> small classes,
    largest centre -> large classes.

    Args:
        opt: options object; reads opt.targetspath and is forwarded to
            determine_number_of_class_members.

    Outputs:
        Lists of small, medium and large class ids (integers).
    """
    coords, _, classes = get_labels_geojson(opt.targetspath)
    # NOTE(review): this helper reads the label file a second time.
    unique_classes, _ = determine_number_of_class_members(opt)

    # Boxes are used as (c0, c1, c2, c3) with area = (c2 - c0) * (c3 - c1)
    area = (coords[:,2] - coords[:,0]) * (coords[:,3] - coords[:,1])
    avg_size_classes = np.array(
        [area[classes == class_id].mean() for class_id in unique_classes],
        dtype=float,
    )

    k_areas = KMeans(3,random_state=0).fit(avg_size_classes.reshape(-1,1))

    # KMeans numbers its clusters arbitrarily, so labels 0/1/2 do not mean
    # small/medium/large. Rank the clusters by centre (stable sort, so tied
    # centres keep their label order).
    centers = k_areas.cluster_centers_.ravel()
    small_label, medium_label, large_label = np.argsort(centers, kind='mergesort')

    small_classes = unique_classes[k_areas.labels_ == small_label].astype(int)
    medium_classes = unique_classes[k_areas.labels_ == medium_label].astype(int)
    large_classes = unique_classes[k_areas.labels_ == large_label].astype(int)
    return list(small_classes), list(medium_classes), list(large_classes)