# from app import app, db#, cache

from matplotlib.mlab import griddata
import matplotlib.pyplot as plt

import time, os
from collections import defaultdict

import numpy as np
from numpy import linspace
from numpy import meshgrid
from numpy.random import uniform, seed

from mpl_toolkits.basemap import Basemap, shiftgrid
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

title_sz = 27
axis_sz = 22
tick_sz = 21

from matplotlib.ticker import MaxNLocator
from matplotlib import cm, rcParams
rcParams.update({'font.size': tick_sz-4}) # Increase font-size
# from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.cm as cm
from matplotlib import gridspec

import pylab as pl

from itertools import islice
import sys, pickle, time, copy, re

from chem_ocean import Build_Map as bm

import sqlalchemy

import pandas as pd

# Visualizing cluster centers given added geo features

from sklearn.cluster import MiniBatchKMeans as mkm
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
def connect(user, password, db, host='localhost', port=5432):
    '''Returns a connection and a metadata object'''
    url  = 'postgresql://{}:{}@{}:{}/{}'
    url = url.format(user, password, host, port, db)

    # The return value of create_engine() is our connection object
    con = sqlalchemy.create_engine(url, client_encoding='utf8')

    # We then bind the connection to MetaData()
    meta = sqlalchemy.MetaData(bind=con, reflect=True)

    return con, meta, url
conn, meta, url = connect('jlanders', '', 'odv_app_dev', host='localhost', port=5432)
# (captured cell output)
# /usr/local/lib/python3.6/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use "pip install psycopg2-binary" instead. For details see: <http://initd.org/psycopg/docs/install.html#binary-install-from-pypi>.
#   """)
# /usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:10: SADeprecationWarning: reflect=True is deprecate; please use the reflect() method.
#   # Remove the CWD from sys.path while we load stuff.
minLat = -55
maxLat = 60
minLon = -180
maxLon = 180
# Helper function to query DataFrame for slice of interest
def dfQueryFunc(minLat, maxLat, minLon, maxLon, _var_names,**kwargs):
    basic = ['station', 'longitude', 'latitude']
    
    if 'Depth' in kwargs:
        sum_names = basic +_var_names
    else: 
        sum_names = basic + ['depth'] + _var_names

    print(sum_names)
    cols = ', '.join(sum_names)

    if 'Latitude' in kwargs:
        lat_target = kwargs['Latitude']
        minLat = lat_target-1.5
        maxLat = lat_target+1.5
        (lonMin, lonMax) = kwargs['lonLimits']
        # _var_df = _df3_sub[sum_names].query('Latitude >= '+str(lat_target-3)+ 'and Latitude <= '+str(lat_target+3)+ ' and Longitude >= '+str(lonMin)+ 'and Longitude <= '+str(lonMax))
    if 'Longitude' in kwargs:
        lon_target = kwargs['Longitude']
        minLon = lon_target-1.5
        maxLon = lon_target+1.5
        (latMin, latMax) = kwargs['latLimits']
        # _var_df = _df3_sub[sum_names].query('Longitude >= '+str(lon_target-3)+ 'and Longitude <= '+str(lon_target+3) +' and Latitude >= '+str(latMin)+ 'and Latitude <= '+str(latMax))
    query = "select "+ cols+ " from woa13 where latitude> {} AND latitude< {} AND longitude>{} and longitude<{};".format(str(minLat), str(maxLat), str(minLon), str(maxLon))
    
    if 'Depth' in kwargs:
        depth_target = kwargs['Depth']
        query = "select "+ cols+ " from woa13 where latitude> {} AND latitude< {} AND longitude>{} and longitude<{} and depth={};".format(str(minLat), str(maxLat), str(minLon), str(maxLon), str(depth_target))
    print(query)
    return query, sum_names#_var_df
tracers = ['salinity', 'oxygen', 'nitrate', 'temperature']
query, sum_names = dfQueryFunc(minLat, maxLat, minLon, maxLon, tracers, Depth=1500)
# (captured cell output)
# ['station', 'longitude', 'latitude', 'salinity', 'oxygen', 'nitrate', 'temperature']
# select station, longitude, latitude, salinity, oxygen, nitrate, temperature from woa13 where latitude> -55 AND latitude< 60 AND longitude>-180 and longitude<180 and depth=1500;
result = conn.execute(query)
data_array = []
for row in result:
    data_array.append(list(row))
df = pd.DataFrame(data_array)

df.columns = sum_names
df.dropna()

lat_scale = StandardScaler()
lat_scale.fit(df[['latitude']].as_matrix())
df['s_latitude'] = lat_scale.transform(df[['latitude']])

lon_scale = StandardScaler()
lon_scale.fit(df[['longitude']].as_matrix())
df['s_longitude'] = lon_scale.transform(df[['longitude']])

for tracer in tracers:
    df[tracer].round(decimals = 3)
    
    scale = StandardScaler()

    df[[tracer]] = scale.fit_transform(df[[tracer]].as_matrix())

df.describe()
# (captured cell output: df.describe())
# longitude latitude salinity oxygen nitrate temperature s_latitude s_longitude
# count 26361.000000 26361.000000 2.636100e+04 2.636100e+04 2.636100e+04 2.636100e+04 2.636100e+04 2.636100e+04
# mean -14.016217 -7.010944 -2.577911e-15 -1.983838e-16 3.795168e-16 -4.808650e-16 -1.725076e-17 -2.156345e-18
# std 111.438920 30.964058 1.000019e+00 1.000019e+00 1.000019e+00 1.000019e+00 1.000019e+00 1.000019e+00
# min -179.500000 -54.500000 -2.444244e+01 -2.257720e+00 -4.594790e+00 -2.760163e+00 -1.533712e+00 -1.485001e+00
# 25% -118.500000 -34.500000 -2.362516e-01 -8.329838e-01 -3.065452e-01 -4.126793e-01 -8.877899e-01 -9.376057e-01
# 50% -25.500000 -9.500000 -1.524248e-01 1.408474e-01 1.524210e-01 -2.182675e-01 -8.038685e-02 -1.030520e-01
# 75% 83.500000 17.500000 9.648754e-02 7.312365e-01 7.339001e-01 2.275435e-01 7.916084e-01 8.750809e-01
# max 179.500000 59.500000 8.061613e+00 2.396203e+00 1.977760e+00 9.611661e+00 2.148046e+00 1.736556e+00
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17, 17), facecolor='w')
_basemap, fig, ax = bm.build_map('y', 'merc', minLat, maxLat, minLon, maxLon, 'c', fig, ax, 111, 'lb')

colors = ['bo', 'ro', 'go', 'co']
for ik, tracer in enumerate(tracers):
    data = df[['s_longitude', 's_latitude', tracer]].values
    kmeans = km(n_clusters=8, random_state=0).fit(data)
    centers = kmeans.cluster_centers_
    for center in centers:
        x,y = _basemap(lon_scale.inverse_transform([center[0]]), lat_scale.inverse_transform([center[1]]))
        ax.plot(x, y, colors[ik], markersize=10, alpha = .8, label = tracer)

        
from collections import OrderedDict
import matplotlib.pyplot as plt

handles, labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys(), loc = 3)

_x = df['longitude'].as_matrix()
_y = df['latitude'].as_matrix()
_feat_data= df['oxygen'].as_matrix()

x_coord,y_coord = _basemap(_x, _y)
cbar_pad = .22
# plt.tight_layout(pad=3, w_pad=4., h_pad=3.0)
ylabpad = 35
xlabpad = 25
# plt.tight_layout(pad=0.4, w_pad=1., h_pad=1.0)
            

# define grid.
xi = np.linspace(min(x_coord), max(x_coord),300)
yi = np.linspace(min(y_coord), max(y_coord),300)

# grid the data.
zi = griddata(x_coord,y_coord,_feat_data,xi,yi,interp='linear')

# contour the gridded data, plotting dots at the nonuniform data points.
CS = plt.contour(xi,yi,zi,10,linewidths=0.5,colors='k')
CS = plt.contourf(xi,yi,zi,40,cmap=plt.cm.brg, 
                vmax=2*abs(zi).max(), vmin=abs(zi).min())
# plt.legend(loc=3)
# (captured cell output)
# 0.34735774993896484
# ../_images/f8034396d70f91d7a4275aa63a010e1a0b916ea915a5e3ef08e94821d3d6d6bb.png
# Returns data for specific variables for relevant spatial constraints 
# uses: dfQueryFunc
# slice type: plan, NSsection, EWsection
# kwargs: plan: depth; NSsection: lonTraj, latLImits; EWsection: latTrag, lonLimits
def amassData(minLat, maxLat, minLon, maxLon, _in_var_names, _sliceType, **kwargs):
    DB_NAME = os.environ['DB_NAME']
    DB_USER = os.environ['DB_USER']
    DB_PASS = os.environ['DB_PASS']
    DB_SERVICE = os.environ['DB_SERVICE']
    DB_PORT = os.environ['DB_PORT']
    SQLALCHEMY_DATABASE_URI = 'postgresql://{0}:{1}@{2}:{3}/{4}'.format(DB_USER, DB_PASS, DB_SERVICE, DB_PORT, DB_NAME)
    # conn, meta, url = connect('jlanders', '', 'odv_app_dev2', host='localhost')    
    conn = sqlalchemy.create_engine(SQLALCHEMY_DATABASE_URI, client_encoding='utf8')

    print(_sliceType)
    # Depth slice
    if _sliceType == 'plan':
        if 'depth' in kwargs:
            _depth = kwargs['depth']
        else:
            _depth = 1500
        

        query, cols = dfQueryFunc(minLat, maxLat, minLon, maxLon, _in_var_names, Depth = _depth)


        # psql
        t0 = time.time()
        result = conn.execute(query)
        cluster_d = defaultdict(list)
        for row in result:
            for ik, col in enumerate(cols):
                cluster_d[col].append(row[ik]) 

        _x = np.asarray(cluster_d['longitude'])
        _y = np.asarray(cluster_d['latitude'])
        t1 = time.time()
        print('psql', t1-t0)
    
        _xLab = 'Longitude (deg)'
        _yLab = 'Latitude (deg)'
        _basemap = True
        _latLon_params = None
    
    # North-South (longitudinal) slice 
    if _sliceType == 'NSsection':
        if 'lonTraj' in kwargs:
            _lonTraj = kwargs['lonTraj']
            print('_lonTraj', _lonTraj)
        else:
            _lonTraj = -15
      
        if 'latLimits' in kwargs:
            _latLims = kwargs['latLimits']
        else:
            _latLims = [-55, -35]
        
        query, cols = dfQueryFunc(minLat, maxLat, minLon, maxLon, _in_var_names, Longitude = _lonTraj, latLimits = _latLims)

        # psql
        t0 = time.time()
        result = conn.execute(query)
        cluster_d = defaultdict(list)
        for row in result:
            for ik, col in enumerate(cols):
                cluster_d[col].append(row[ik]) 

        _x = np.asarray(cluster_d['latitude'])
        _y = np.asarray(cluster_d['depth'])
        t1 = time.time()
        print('psql', t1-t0)
      
        _yLab = 'Depth (m)'  
        _xLab = 'Latitude (deg) along '+ r'%s $^\circ$' % abs(_lonTraj)+ ['W' if lon<0 else 'E' for lon in [_lonTraj]][0]
        _basemap = False
        _latLon_params = ((_lonTraj, _lonTraj), _latLims)
  
  # East-West (latitudeinal) slice
    if _sliceType == 'EWsection':
        if 'latTraj' in kwargs:
            _latTraj = kwargs['latTraj']
        else:
            _latTraj = -45
      
        if 'lonLimits' in kwargs:
            _lonLims = kwargs['lonLimits']
        else:
            _lonLims = [-10, 10]
        
        query, cols = dfQueryFunc(minLat, maxLat, minLon, maxLon, _in_var_names, Latitude = _latTraj, lonLimits = _lonLims)      

        # psql
        t0 = time.time()
        result = conn.execute(query)
        cluster_d = defaultdict(list)
        for row in result:
            for ik, col in enumerate(cols):
                cluster_d[col].append(row[ik]) 

        _x = np.asarray(cluster_d['longitude'])
        _y = np.asarray(cluster_d['depth'])
        t1 = time.time()
        print('psql', t1-t0)
      
        _yLab = 'Depth (m))'  
        _xLab = 'Longitude (deg) along '+ r'%s $^\circ$' % abs(_latTraj)+ ['S' if lat<0 else 'N' for lat in [_latTraj]][0]
        _basemap = False
        _latLon_params = (_lonLims, [_latTraj, _latTraj])
  
    # _feat_data = cluster_df[_in_var_names].as_matrix()

    _feat_data = np.zeros((len(cluster_d['station']), len(_in_var_names)))
    for ik, name in enumerate(_in_var_names):
        _feat_data[:, ik] = np.asarray(cluster_d[name])

      
    mask = np.all(np.isnan(_feat_data), axis=1)
    _x = _x[~mask]
    _y = _y[~mask]
    _feat_data = _feat_data[~mask]
    
    if len(_in_var_names) == 1: 
        _feat_data = np.ravel(_feat_data)
  
    return _x, _y, _feat_data, _basemap, _xLab, _yLab, _latLon_params

def make_section(ax, _colors, _x, _y, _latLon_params, model_raw):
    if model_raw == 'model':
        for im in range(len(_x)):
            ax.plot(_x[im], _y[im], c = _colors[im], marker = 'o', markersize=15, alpha = .1)

    ax.invert_yaxis()

    axin = inset_axes(ax, width="35%", height="35%", loc=4)
    inmap = Basemap(projection='ortho', lon_0=np.mean(_latLon_params[0]), lat_0=0,
        ax=axin, anchor='NE')
    inmap.fillcontinents()
    inmap.drawcoastlines(color='k')
    inmap.plot(_latLon_params[0], _latLon_params[1], '-k', linewidth=2 , latlon=True)  
    return ax


'''
Plots raw data either in section or plan.
If there is a basemap, x=lon, y=lat and data is from a depth slice (plan).
If not, x=lat or lon, y=depth.
Both plot types are colored by contour for a specific chemical constituent. 
'''
# def plotRaw(_x, _y, _feat_data, _basemap, _xLab, _yLab, _latLon_params, _in_var_names):

def getRaw(minLat, maxLat, minLon, maxLon, _in_var_names, _sliceType, **kwargs):
    units_dict = {'temperature': '$^\circ$C', 'oxygen': 'ml/l', 'aou': 'ml/l', 'longitude': '$^\circ$E', 'salinity': '(psu)', 'nitrate': '$\mu$mol/l', 'depth': 'm', 'phosphate': '$\mu$mol/l', 'latitude': '$^\circ$N', 'oxygen_saturation': '%'}
    if _sliceType == 'NSsection':
        kwargs['lonTraj'] = minLon
        kwargs['latLimits'] = (minLat, maxLat)
    if _sliceType == 'EWsection':
        kwargs['latTraj'] = minLat
        kwargs['lonLimits'] = (minLon, maxLon)

    _x, _y, _feat_data, _basemap, xLab, yLab, _latLon_params = amassData(minLat, maxLat, minLon, maxLon, _in_var_names, _sliceType, **kwargs)
    return _x, _y, _feat_data, _basemap, xLab, yLab, _latLon_params

def plotRaw(minLat, maxLat, minLon, maxLon, _in_var_names, _sliceType, **kwargs):
    units_dict = {'temperature': '$^\circ$C', 'oxygen': 'ml/l', 'aou': 'ml/l', 'longitude': '$^\circ$E', 'salinity': '(psu)', 'nitrate': '$\mu$mol/l', 'depth': 'm', 'phosphate': '$\mu$mol/l', 'latitude': '$^\circ$N', 'oxygen_saturation': '%'}
    if _sliceType == 'NSsection':
        kwargs['lonTraj'] = minLon
        kwargs['latLimits'] = (minLat, maxLat)
    if _sliceType == 'EWsection':
        kwargs['latTraj'] = minLat
        kwargs['lonLimits'] = (minLon, maxLon)

    _x, _y, _feat_data, _basemap, xLab, yLab, _latLon_params = amassData(minLat, maxLat, minLon, maxLon, _in_var_names, _sliceType, **kwargs)
    if len(_feat_data.shape)>1:
        print( 'Please limit feature data to one variable')
    else:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17, 17), facecolor='w')
        if _basemap:
            _basemap, fig, ax = bm.build_map('y', 'merc', minLat, maxLat, minLon, maxLon, 'c', fig, ax, 111, 'lb')
            x_coord,y_coord = _basemap(_x, _y)
            cbar_pad = .22
            plt.tight_layout(pad=3, w_pad=4., h_pad=3.0)
            ylabpad = 35
            xlabpad = 25
            plt.tight_layout(pad=0.4, w_pad=1., h_pad=1.0)


        else:
            x_coord = _x
            y_coord = _y
            ylabpad = 0
            xlabpad = 0

        # define grid.
        xi = np.linspace(min(x_coord), max(x_coord),300)
        yi = np.linspace(min(y_coord), max(y_coord),300)

        # grid the data.
        zi = griddata(x_coord,y_coord,_feat_data,xi,yi,interp='linear')

        # contour the gridded data, plotting dots at the nonuniform data points.
        CS = plt.contour(xi,yi,zi,10,linewidths=0.5,colors='k')
        CS = plt.contourf(xi,yi,zi,15,cmap=plt.cm.rainbow, 
                        vmax=abs(zi).max(), vmin=abs(zi).min())
        
        # label axes
        ax.set_ylabel(yLab, fontsize=axis_sz-3, labelpad = ylabpad)
        ax.set_xlabel(xLab, fontsize=axis_sz-3, labelpad = xlabpad)

        # reverse axis of y is depth
        if not _basemap:
            ax = make_section(plt.gca(), None, _x,_y, _latLon_params, 'raw')  
            cbar_pad = .22
        
        # create colorbar
        divider = make_axes_locatable(ax)
        cax = divider.append_axes("right", size="3%", pad=cbar_pad)
        cbar = plt.colorbar(CS, cax=cax)
        cbar_label = _in_var_names[0] + ' (' + units_dict[_in_var_names[0]] + ')'
        cbar.ax.set_ylabel(cbar_label, fontsize=axis_sz) 
        
        # set tick size
        xtickNames = ax.get_xticklabels()
        ytickNames = ax.get_yticklabels()
        cbartickNames = cbar.ax.get_yticklabels()

        for names in [ytickNames, xtickNames, cbartickNames]:
            plt.setp(names, rotation=0, fontsize=tick_sz-4)

        # plt.show()
        return fig