
python – PySpark unit testing


Question:
import unittest
import warnings
from datetime import datetime

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, TimestampType, FloatType

from ohlcv_service.ohlc_gwa import datetime_col


class ReusedPySparkTestCase(unittest.TestCase):
    sc_values = {}

    @classmethod
    def setUpClass(cls):
        conf = (SparkConf().setMaster('local[2]')
                .setAppName(cls.__name__)
                .set('deploy.authenticate.secret', '111111'))
        cls.sc = SparkContext(conf=conf)
        cls.sc_values[cls.__name__] = cls.sc
        cls.spark = (SparkSession.builder
                     .master('local[2]')
                     .appName('local-testing-pyspark-context')
                     .getOrCreate())

    @classmethod
    def tearDownClass(cls):
        print('....calling stop tearDownClass, the content of sc_values=', cls.sc_values, '\n')
        for key, sc in cls.sc_values.items():
            print('....closing=', key, '\n')
            sc.stop()

        cls.sc_values.clear()


class TestDateTimeCol(ReusedPySparkTestCase):

    def setUp(self):
        # Ignore ResourceWarning: unclosed socket.socket!
        warnings.simplefilter("ignore", ResourceWarning)

    def test_datetime_col(self):
        test_data_frame = self.create_data_frame(rows=[['GWA',
                                                        '2b600c2a-782f-4ccc-a675-bbbd7d91fde4',
                                                        '02fb81fa-91cf-4eab-a07e-0df3c107fbf8',
                                                        '2019-06-01T00:00:00.000Z',
                                                        0.001243179008694,
                                                        0.001243179008694,
                                                        0.001243179008694,
                                                        0.001243179008694,
                                                        0.001243179008694]],
                                                 columns=[StructField('indexType', StringType(), False),
                                                          StructField('id', StringType(), False),
                                                          StructField('indexId', StringType(), False),
                                                          StructField('timestamp', StringType(), False),
                                                          StructField('price', FloatType(), False),
                                                          StructField('open', FloatType(), False),
                                                          StructField('high', FloatType(), False),
                                                          StructField('low', FloatType(), False),
                                                          StructField('close', FloatType(), False)])
        expected = self.create_data_frame(rows=[['GWA',
                                                 '2b600c2a-782f-4ccc-a675-bbbd7d91fde4',
                                                 '02fb81fa-91cf-4eab-a07e-0df3c107fbf8',
                                                 '2019-06-01T00:00:00.000Z',
                                                 '1559347200',
                                                 0.001243179008694,
                                                 0.001243179008694,
                                                 0.001243179008694,
                                                 0.001243179008694,
                                                 0.001243179008694]],
                                          columns=[StructField('indexType', StringType(), False),
                                                   StructField('id', StringType(), False),
                                                   StructField('indexId', StringType(), False),
                                                   StructField('timestamp', StringType(), False),
                                                   StructField('datetime', TimestampType(), True),
                                                   StructField('price', FloatType(), False),
                                                   StructField('open', FloatType(), False),
                                                   StructField('high', FloatType(), False),
                                                   StructField('low', FloatType(), False),
                                                   StructField('close', FloatType(), False)])
        print(expected)
        convert_to_datetime = datetime_col(test_data_frame)
        self.assertEqual(expected, convert_to_datetime)

    def create_data_frame(self, rows, columns):
        rdd = self.sc.parallelize(rows)
        df = self.spark.createDataFrame(rdd.collect(), test_schema(columns=columns))
        return df


def test_schema(columns):
    return StructType(columns)


if __name__ == '__main__':
    unittest.main()

Error

TimestampType can not accept object '1559347200' in type <class 'str'>

datetime_col Function

def datetime_col(df):
    return df.select("indexType", "id", "indexId", "timestamp",
                     (F.col("timestamp").cast(TimestampType)).alias("datetime"),
                     "price", "open", "high", "low", "close")

The datetime_col function converts the timestamp column from string to timestamp format. This works properly in an EMR Zeppelin notebook, but when I try to unit test it locally it throws the above error. The Spark and PySpark version on my machine is 2.3.1. How do I resolve this error? (When I convert the Spark DataFrame to a pandas DataFrame, the timestamp comes out with a +12 offset.)

Answers:

I cannot really reproduce your problem in your EMR setup; you don’t post a lot of info, and I would not be able to set it up anyway. But there are a few problems with your test case that I can try to help you with.

The error message you see happens because createDataFrame cannot accept a string (here, an epoch value like '1559347200') for a TimestampType field; it only accepts datetime objects. You need to use from_unixtime to build the expected column instead. Something like this works fine.

expected = self.create_data_frame(rows=[['GWA',
                                         '2b600c2a-782f-4ccc-a675-bbbd7d91fde4',
                                         '02fb81fa-91cf-4eab-a07e-0df3c107fbf8',
                                         '2019-06-01T00:00:00.000Z',
                                         None,
                                         0.001243179008694,
                                         0.001243179008694,
                                         0.001243179008694,
                                         0.001243179008694,
                                         0.001243179008694]],
                                  columns=[StructField('indexType', StringType(), False),
                                           StructField('id', StringType(), False),
                                           StructField('indexId', StringType(), False),
                                           StructField('timestamp', StringType(), False),
                                           StructField('datetime', TimestampType(), True),
                                           StructField('price', FloatType(), False),
                                           StructField('open', FloatType(), False),
                                           StructField('high', FloatType(), False),
                                           StructField('low', FloatType(), False),
                                           StructField('close', FloatType(), False)])
# Note: this needs `from pyspark.sql.functions import from_unixtime`
expected = expected.withColumn('datetime', from_unixtime(F.lit(1559347200)).cast(TimestampType()))
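
For reference, here is a minimal, self-contained repro of the original error and the datetime-based alternative (a sketch; the session setup is illustrative):

from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, TimestampType

spark = (SparkSession.builder
         .master('local[2]')
         .appName('timestamp-repro')
         .getOrCreate())

schema = StructType([StructField('datetime', TimestampType(), True)])

# A plain Python datetime is accepted by a TimestampType field...
ok = spark.createDataFrame([(datetime(2019, 6, 1),)], schema)

# ...but the epoch string from the original expected rows is not:
# spark.createDataFrame([('1559347200',)], schema)
# -> TypeError: field datetime: TimestampType can not accept object '1559347200' in type <class 'str'>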

The second problem is that your datetime_col function might work fine on the cluster (as I said, I cannot really reproduce that), but it does not work locally: Column.cast expects a DataType instance or a string, and cast(TimestampType) passes the class itself. The following version would certainly work in both.

from pyspark.sql import functions as F
from pyspark.sql.functions import to_timestamp

def datetime_col(df):
    return df.select("indexType", "id", "indexId", "timestamp",
                     to_timestamp(F.col("timestamp"),
                                  "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").alias("datetime"),
                     "price", "open", "high", "low", "close")

You need to set your timezone for everything to work fine (in your setUpClass):

cls.spark.conf.set("spark.sql.session.timeZone", "UTC")
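
For example, this is the setUpClass from the question with the timezone pinned at the end:

@classmethod
def setUpClass(cls):
    conf = (SparkConf().setMaster('local[2]')
            .setAppName(cls.__name__)
            .set('deploy.authenticate.secret', '111111'))
    cls.sc = SparkContext(conf=conf)
    cls.sc_values[cls.__name__] = cls.sc
    cls.spark = (SparkSession.builder
                 .master('local[2]')
                 .appName('local-testing-pyspark-context')
                 .getOrCreate())
    # Pin the session timezone so timestamp parsing is deterministic
    # regardless of the machine's local timezone (the likely cause of
    # the +12 offset seen when converting to pandas).
    cls.spark.conf.set("spark.sql.session.timeZone", "UTC")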

And finally, in your assert you have to collect the data, so that you compare the contents of the DataFrames rather than the DataFrame objects themselves.

self.assertEqual(expected.collect(), convert_to_datetime.collect())
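
Note that collect() compares only the row data. If you also want the schemas checked (names, types, nullability), a common pattern (not part of the original fix, just a suggestion) is to assert on them separately:

self.assertEqual(expected.schema, convert_to_datetime.schema)
self.assertEqual(expected.collect(), convert_to_datetime.collect())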

Hope it helps.