Data Science Library for python learning

numpy create array

#Import the numpy library and alias it as np.
import numpy as np

#1). Create an array: a, b, c create the same array, choose any one;
#Method 1: Transform the data type to ndarray by passing the array/list directly into the array method.
a = np.array([1, 2, 3, 4, 5])
 #The value passed in by the array method can be a range object
b = np.array(range(1, 6))
print(b)
li = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11.8, 12]
]
# dtype casts every element on conversion. `np.int` was removed in NumPy 1.24;
# use the builtin `int` (or np.int64) instead. The float 11.8 is truncated to 11.
li_ndarray = np.array(li, dtype=int)

#Method 2: Generate the specified value directly
c = np.arange(1, 6)
##The basic syntax and parameters of arange and range are consistent.Advantage of arange: Decimal intervals can be generated.
#d = np.arange(2.2, 13.3, 0.2)
#print(a, b, c)

##2). View the array type created by numpy
print(type(a))
print(type(b))
print(type(c))

#3). Look at the data types stored in arrays, what are the common data types?
print(a.dtype)   # Why int64?Because the hardware architecture is 64 bit;

#4). Formulate the data type of the created array
# `np.float` was removed in NumPy 1.24; the builtin `float` (== np.float64) replaces it.
d = np.array([1.9, 0, 1.3, 0], dtype=float)
print(d, d.dtype)
#5). Modify the data type of the array
e = d.astype('int64')   # It can be either a data type or a data code; int64--i1
print(e, e.dtype)

#6). Modify the number of decimal places in a floating point number
#Create a random array of three rows and four columns.
f = np.random.random((3, 4))
print(f)

#Modify floating point values to have 3 decimal places
g = np.round(f, 3)
print(g)

Transposition of Matrix

import numpy as np

#size = (3, 4), produces a number between 0 and 1, the number of values 3*4 = 12.
#12 = 1*12 = 2*6 = 3*4 = 4*3 = 6*2 = 12*1
#data = np.random.random(size=(3, 4))
data = np.random.random((3, 4))

#Transform data structure # 2,6
data = data.reshape((2, 6))

"""
//What is transpose?
    1). Rows of the original matrix become columns of the present matrix
    2). All elements of A are mirrored across the 45-degree main diagonal that starts at row 1, column 1, producing the transpose of A.

//Case: 
    # 3*2
    a = [
         [1, 2], 
         [3, 4], 
         [5, 6],    
    ]    

    #Transpose
    b = [
        1   3   5 
        2   4   6
    ]  
"""
print(data)
print(data.shape)
print("Transpose: ", data.T)
print("Transpose: ", data.transpose())
#The numpy inner axis is divided into 0 axes (x rows) and 1 axes (y columns)
print("Transpose: ", data.swapaxes(1, 0))

Index and slice of numpy

import numpy as np

#1). Generate test data
#np.arange(12)  ==> ndarray:[0 1 2 3 4 5 6 7 8 9 10 11]
#reshape((3, 4))  ===>
a = np.arange(12).reshape((3, 4))

print(a)

#*******************Single row or single column***************
#Take line 2; a[i]=>Get line i+1
print("Line 2: ", a[1])
#Take column 3; a[:, i] =>Get column i+1
print("Column 2 Contents: ",a[:, 2])
#Get data for row 2 and column 3; a[i, j] ==> Get row i+1 j+1 column
print("Contents of row 2 and column 3: ", a[1, 2])

#*******************Take consecutive rows or columns***************
#Take rows 2 and 3; a[i:j]===>Get rows i+1 through J
print(a[1:3])

#Take columns 3 and 4; a[:, i:j] ==> Get columns i+1 to j
print(a[:, 2:4])

#Rows: 1 and 2 columns: 2, get the second column in the first 2 rows
print(a[0:2, 1:2])

"""
//Requirement: Take out an array of 2*2
    1 2
    5 6 
1). Get the first 2 rows first
2). Get columns 2 and 3 from the first 2 rows      
"""
print(a[:2, 1:3])

#*******************Take discontinuous rows or columns***************
#Rows: 1 and 3 columns: all gets all the elements of the first and third rows
print(a[[0, 2], :])
#Row: all column: 1, 4
print(a[:, [0, 3]])
#Rows: 1, 3 columns: 14 Gets the elements of the first row, the first column, and the third row, the fourth column
print("*"*10)
print(a[[0, 2], [0, 3]])

Modification of numpy values

import numpy as np

#Perform row and specified column modifications
t = np.arange(24).reshape((4, 6))
print(t)

#Method 1: Evaluate elements according to index/slice.
#Row: all, column: 3,4
#Ideas for indexing: 2~4-1 columns: 3~4
#Row/Column Position: 3~4
t[:, 2:4] = 0
print(t)

#Method 2: Boolean index, if the condition/Bool=True is satisfied, assign value, otherwise, do not modify
#Returns the matrix (as with the size of t) and stores the Bool type
print(t < 10)

t[t < 10] = 100
print(t)

t[t > 20] = 200
print(t)

#Method 3: numpy's ternary operator t<100?0:10
# Satisfies the condition/Bool=True, then assigns = value1, otherwise assigns = value2
t1  = np.where(t < 100, 0, 10)
print(t)
print(t1)

Get the four corner elements of a matrix

"""
//Gets the elements of the four corners of the 4X3 array.Row indexes are [0,0] and [3,3], while column indexes are [0,2] and [0,2].

 1 2 3 4
 5 6 7 8
 9 10 11

"""

import numpy as np
def get_edge(data):
    """Return the four corner elements of a 2-D array as a 2x2 array."""
    n_rows, n_cols = data.shape
    # Fancy-index with matching 2x2 row/column index arrays:
    # result[i, j] == data[corner_rows[i, j], corner_cols[i, j]]
    corner_rows = np.array([[0, 0], [n_rows - 1, n_rows - 1]])
    corner_cols = np.array([[0, n_cols - 1], [0, n_cols - 1]])
    return data[corner_rows, corner_cols]

if __name__ == '__main__':
    x = np.arange(30).reshape((5, 6))
    print("data: ", x)
    print("result: ", get_edge(x))

#x = np.arange(12).reshape((4, 3))
#print('Our array is:')
#print(x)
##Get the four-corner element, what is its corresponding index?
##0=(0,0)  2=(0,2) 9=(3,0)  11=(3,2)

#row, column = x.shape  # row=4 column=3
##rows = np.array([[0, 0], [3, 3]])   # Obtained row information
##cols = np.array([[0, 2], [0, 2]])   # Get column information

#rows = np.array([[0, 0], [row-1, row-1]])
#cols = np.array([[0, column-1], [0, column-1]])
#y = x[rows, cols]
#print('The four corner elements of this array are:')
#print(y)

Modification of Array Shape

"""
    reshape Modify shapes without changing data
         numpy.reshape(arr, newshape, order='C')
         order: 'C' -- By line,'F' -- By column,'A' -- In the original order,'k' -- The order in which elements appear in memory.
    flat    Array Element Iterator

    flatten Returns a copy of the array, changes made to the copy will not affect the original array
    ravel   Return Expanded Array
"""

import numpy as np

print("****************************************flat********************************")
a = np.arange(9).reshape(3, 3)
print('Original array:')
for row in a:
    print(row)

##Each element in the array is processed (expanded) using the flat attribute, which is an array element iterator:
print('Iterated array:')
for element in a.flat:
    print(element)

print("*********************************flatten**************************************")
a = np.arange(8).reshape(2, 4)

print('Original array:')
print(a)
print('\n')
 //Default by line

print('Expanded array:')
print(a.flatten())
print('\n')

print('with F Style order expanded array:')
print(a.flatten(order='F'))

print("*********************************ravel*************************************")
a = np.arange(8).reshape(2, 4)

print('Original array:')
print(a)
print('\n')

print('call ravel After function:')
print(a.ravel())
print('\n')

print('with F Style Sequential Call ravel After function:')
print(a.ravel(order='F'))

Array Stitching

"""
concatenate Connect an array sequence along an existing axis
stack   Add a series of arrays along the new axis.
hstack  Arrays in a Horizontal Stacked Sequence (Column Direction)
vstack  Arrays in a vertical stacked sequence (row direction)

"""

import numpy as np

print("******************** concatenate ****************")
a = np.array([[1, 2], [3, 4]])
print('First array:')
print(a)
print('\n')

b = np.array([[5, 6], [7, 8]])
print('Second array:')
print(b)
print('\n')

##Two arrays have the same dimension
##x-axis and y-axis, 0-axis and 1-axis
#print('Connect two arrays along axis 0:')
print(np.concatenate((a, b)))
print('\n')

#print('Connect two arrays along axis 1:')
print(np.concatenate((a, b), axis=1))

 print("*************************stack*********************************")
#a = np.array([[1, 2], [3, 4]])
#print('first array:')
#print(a)
#print('\n')
#b = np.array([[5, 6], [7, 8]])

#print('second array:')
#print(b)
#print('\n')
#print('stack two arrays along axis 0:')
#stack_0 = np.stack((a, b), axis=-1)
#print(stack_0)
#print(stack_0.shape)
#print('\n')
#print('stack two arrays along axis 1:')
#print(np.stack((a, b), axis=1))

print("**************************************hstack + vstack*************************************")
a = np.array([[1, 2], [3, 4]])

print('First array:')
print(a)
print('\n')
b = np.array([[5, 6], [7, 8]])

print('Second array:')
print(b)
print('\n')

print('Horizontal stacking:')
c = np.hstack((a, b))
print(c)
print('\n')

print('Vertical stacking:')
c = np.vstack((a, b))
print(c)
print('\n')

Array Split

"""
    split   Split an array into subarrays
        numpy.split(ary, indices_or_sections, axis)
    hsplit  Split an array horizontally into subarrays (by column)
    vsplit  Divide an array vertically into subarrays (by row)
"""

import numpy as np

print("**********************split******************************")
a = np.arange(9)
print('First array:')
print(a)
print('\n')

print('Divide an array into three equally sized subarrays:')
b = np.split(a, 3)
print(b)
print('\n')

print('Split the position of the array indicated in a one-dimensional array:')
b = np.split(a, [1, 7])
print(b)

print('******************hsplit*****************')
harr = np.arange(12).reshape((3, 4))
print('primary array: ')
print(harr)

print('After horizontal splitting:')
print(np.hsplit(harr, 2))

print("***************************vsplit****************************")

a = np.arange(12).reshape(4, 3)

print('First array:')
print(a)
print('\n')

print('Vertical division:')
b = np.vsplit(a, 2)
print(b)

Addition and deletion of array elements

"""
    resize  Returns a new array of the specified shape
    append  Add value to end of array
    insert  Inserts the value along the specified axis before the specified subscript
    delete  Delete the subarray of an axis and return the deleted new array
    unique  Find the only element in the array
            arr: Input array, expands if it is not a one-dimensional array
            return_index: If is true,Returns the position (subscript) of the new list element in the old list and stores it as a list
            return_counts: If is true,Returns the number of occurrences of elements in a de-multiplication array in the original array

"""
import numpy as np

print('***************append****************')
a = np.array([[1, 2, 3], [4, 5, 6]])

print('First array:')
print(a)
print('\n')

print('Add elements to the array:')
#When no row/column append element is specified, the preceding element is expanded before appending by default
print(np.append(a, [7, 8, 9]))      # [1 2 3 4 5 6 7 8 9]
print('\n')

print('Add elements along axis 0:')
print(np.append(a, [[7, 8, 9]], axis=0))
print('\n')

print('Add elements along axis 1:')
print(np.append(a, [[5, 5, 5], [7, 8, 9]], axis=1))

print('******************************insert****************************************')
a = np.array([[1, 2], [3, 4], [5, 6]])

print('First array:')
print(a)
print('\n')

print('Undelivered Axis Parameters.The input array is expanded before insertion.')
print(np.insert(a, 3, [11, 12]))
print('\n')

print('Passed Axis Parameters.An array of values is broadcast to match the input array.')
print('Broadcast along axis 0:')
print(np.insert(a, 1, [11, 100], axis=0))
print('\n')

print('Broadcast along Axis 1:')
print(np.insert(a, 1, [11,12, 13], axis=1))

print('***********************delete******************************************')
a = np.arange(12).reshape(3, 4)

print('First array:')
print(a)
print('\n')

print('Undelivered Axis Parameters.The input array is expanded before insertion.')
print(np.delete(a, 5))
print('\n')

print('Delete the second column:')
print(np.delete(a, 1, axis=1))
print('\n')

print('Delete the second line:')
print(np.delete(a, 1, axis=0))
print('\n')

print('Slices containing alternate values deleted from the array:')
a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
"""
np.s_ A better way to index tuples for arrays.When returned slice object;
You can also build the same selection with slice() objects, but np.s_ is simpler because it uses the standard array-indexing syntax.
"""
print(np.delete(a, np.s_[::2]))

print("Delete 2D Array")
data = np.arange(12).reshape((3, 4))
print("Array Elements:")
print(data)
#Row: Column: Start of 2 Columns
print(np.delete(data, np.s_[::2], axis=0))
print(np.delete(data, np.s_[::2], axis=1))

print('****************************unique**********************************************')

a = np.array([5, 2, 6, 2, 7, 5, 6, 8, 2, 9])

print('First array:')
print(a)
print('\n')

print('The de-multiplication value of the first array:')     # [2 5 6 7 8 9]
u = np.unique(a)
print(u)
print('\n')

print('Indexed array of de-multiplied arrays:')     # [a.index(2), a.index(5), ........a.index(9)]
u, indices = np.unique(a, return_index=True)
print(indices)
print('\n')

print('Returns the number of repetitions of a De-heavy element:')
u, indices = np.unique(a, return_counts=True)
print(u)
print(indices)

numpy statistical function

"""
numpy.amin() is used to calculate the minimum value of elements in an array along a specified axis.
numpy.amax() is used to calculate the maximum value of elements in an array along a specified axis.
The numpy.ptp() function calculates the difference (maximum-minimum) between the maximum and minimum values of elements in an array.
The numpy.percentile() percentile is a measure used in statistics to represent the percentage of observations smaller than this value.
The numpy.median() function calculates the median (median) of the elements in array a

#Mean value, each value has the same proportion/weight. Language: 100 Maths: 70 English: 80 (100+70+80)/3
The numpy.mean() function returns the arithmetic mean of the elements in the array.If an axis is provided, it is calculated along it.
#Weighted average, the proportion/weight of each value is different. Language: 100 (40%) Mathematics: 70 (40%) English: 80 (20%) (10040%+7040%+80*20%)/3
The numpy.average() function calculates the weighted average of the elements in the array based on the respective weights given in another array.Average()

The standard deviation of np.std() is a measure of the dispersion of the mean values of a set of data.
The standard deviation formula is as follows: STD = sqrt (((x - x.mean()) 2)/n
The variance (sample variance) in np.var() statistics is the average of the square of the difference between each sample value and the mean of the total sample value, i.e. ((x - x.mean()) 2)/n.
The standard deviation is the square root of the variance.

Statistical Analysis of # numpy Students'Height and Weight
Requirement 1:
    //Get the height of all boys, average; get the height of all girls, average; and draw a column chart to show
    1). Read from file Sex and Height
    2). Get the height of a man or a woman by name based on a Boolean index  data['gender'] == 'M'
    3). Average mean function

//Requirement 2:
    //Obtain the weight of all boys and average; Obtain the weight of all girls and average; Draw a column chart to show

"""
import numpy as np
def get_avg_height():
    """Return (male_avg_height, female_avg_height) read from the student data file.

    NOTE(review): assumes doc/eg6-a-student-data.txt has a 9-line header and that
    0-based columns 1 and 3 hold gender ('M'/'F') and height — confirm against the file.
    """
    fname = "doc/eg6-a-student-data.txt"
    # dtype of the two columns we read: a 1-byte string and a 16-bit float
    dtype = np.dtype([('gender', '|S1'), ('height', 'f2')])
    # Read 0-based columns 1 (gender) and 3 (height), skipping the 9 header rows
    data = np.loadtxt(fname=fname, dtype=dtype, skiprows=9,
                      usecols=(1, 3))
    # np.loadtxt returns a structured ndarray; data['gender'] / data['height']
    # select the named fields.

    # Boolean mask — True where the gender byte equals b'M' (loadtxt yields bytes)
    isMale = (data['gender'] == b'M')
    male_avg_height = data['height'][isMale].mean()
    # ~ inverts the mask, selecting the female rows
    female_avg_height = data['height'][~isMale].mean()
    return male_avg_height, female_avg_height
def parser_weight(weight):
    """Convert a raw weight field to float; return the sentinel -99 when it is not numeric.

    Used as an np.loadtxt converter so missing/garbled weight cells do not abort the load.
    """
    try:
        return float(weight)
    except ValueError:
        # Not a number — return the missing-value sentinel instead of raising
        return -99
def get_avg_weight():
    """Return (male_avg_weight, female_avg_weight) read from the student data file.

    NOTE(review): assumes doc/eg6-a-student-data.txt has a 9-line header and that
    0-based columns 1 and 4 hold gender ('M'/'F') and weight — confirm against the file.
    """
    fname = "doc/eg6-a-student-data.txt"
    # dtype of the two columns we read: a 1-byte string and a 16-bit float
    dtype = np.dtype([('gender', '|S1'), ('weight', 'f2')])
    # Read 0-based columns 1 (gender) and 4 (weight); non-numeric weight cells are
    # mapped to the sentinel -99 by the parser_weight converter
    data = np.loadtxt(fname=fname, dtype=dtype, skiprows=9,
                      usecols=(1, 4), converters={4:parser_weight})

    # Boolean mask — True where the gender byte equals b'M'
    isMale = data['gender'] == b'M'

    # Drop rows whose weight was the -99 missing-value sentinel
    is_weight_vaild = data['weight'] > 0
    male_avg_weight = data['weight'][isMale & is_weight_vaild].mean()
    female_avg_weight = data['weight'][~isMale & is_weight_vaild].mean()
    return  male_avg_weight, female_avg_weight
if __name__ == '__main__':
    print(get_avg_height())
    print(get_avg_weight())
"""

Statistical Analysis and Application of Stock Price Based on numpy

#Acquisition of stock data information
    1. Web Crawlers
    2. Website download: Special stock website, Kaggle (website for data analysis and machine learning exchange)
    3. Use the developed API interface for stocks: tushare

The data.csv file stores stock information, with columns 4-8, the D-H column in the EXCEL table, (K line=)
They are the opening price, the highest price, the lowest price, the closing price and the volume of the stock.
Analysis Angle:
    1. Calculate Volume Weighted Average Price
    Concepts: Volume weighted average price, the English name VWAP(Volume-Weighted Average Price) is a very important classic
    Economics represents the "average" price of financial assets.
    The larger the volume of a price, the greater the weight it takes.VWAP is a weighted average calculated by weighting the volume.

    2. Calculate Maximum and Minimum: Calculate the maximum and minimum of the latest highest and lowest prices of a stock price
    3. Calculate the difference between the maximum and minimum recent price of a stock; -- (extreme)
    Calculates the difference between the minimum and maximum recent price of a stock
    4. Calculate the median closing price
    5. Calculate variance of closing price
    6. Calculate logarithmic returns, stock returns, annual and monthly fluctuations
     *** Closing price analysis is often based on stock returns.
     The stock return rate can be divided into simple return rate and logarithmic return rate.
            Simple rate of return: refers to the rate of change between two adjacent prices.diff
            Logarithmic yield: The difference between the two after all prices are logarithmic.
            #[1, 2,3 4]   ======>[-1, ]
    *** Method used: The diff function in NumPy returns an array of differences between adjacent array elements.
    Note, however, that diff returns an array with one element less than the closing price array.
    *** In investment, volatility is a measure of price movement, which can be calculated from historical price data.When calculating historical volatility, you need to use
    To the logarithmic rate of return.
        Annual volatility is the standard deviation of the logarithmic yield divided by its mean and multiplied by the square root of the transaction day, usually 252 days.
        Monthly fluctuation is the standard deviation of the logarithmic yield divided by its mean and multiplied by the square root of the trading month.Usually the transaction month is December.
    7. Get the average closing prices for the trading days of Monday, Tuesday, Wednesday, Thursday and Friday within that time range
    8. Average closing price is lowest and highest is on the day of the week

"""
import numpy as np

print("**********************************************")

#Closing price, volume
endPrice, countNum = np.loadtxt(
    fname="doc/data.csv",
    delimiter=",",  # fixed: the keyword is lowercase `delimiter` (capital D is a TypeError)
    usecols=(6, 7),
    unpack=True)
#print(endPrice, countNum)
# The weight of each price is proportional to its traded volume
VWAP = np.average(endPrice, weights=countNum)
print("1. Calculate Volume Weighted Average Price:", VWAP)

print("**********************************************")

#Top price and lowest price
highPrice, lowPrice = np.loadtxt(
    fname="doc/data.csv",
    delimiter=",",
    usecols=(4, 5),
    #Unpacks the returned data, returns several columns of information, and unpacks several data.
    unpack=True
)

print(highPrice, lowPrice)
# Maximum of the highest prices and minimum of the lowest prices
# (fixed: unbalanced closing parentheses and a stray leading space)
print("2. Maximum Price:", highPrice.max())
print("2. Minimum price:", lowPrice.min())

print("**********************************************")
#Calculates the difference between the maximum and minimum recent price of a stock; --(extreme)
#Calculates the difference between the minimum and maximum recent price of a stock
# (fixed: each print had an extra closing parenthesis)
print("3. Extremely bad recent top price:", np.ptp(highPrice))
print("3. Extremely bad recent minimum price:", np.ptp(lowPrice))

print("**********************************************")
#Calculate the median closing price
 print("4. Calculate the median closing price:", np.median(endPrice)))

print("**********************************************")
#Calculate variance of closing price
 print("5. Calculate closing price variance:", np.var(endPrice)))

print("**********************************************")

def get_week(date):
    """Return the weekday index for a date like b'28-01-2011' (Monday == 0 ... Sunday == 6).

    Fixed: the original docstring used two double quotes instead of three, a SyntaxError.
    """
    from datetime import datetime
    # np.loadtxt passes converter values as bytes, not str — decode first
    date = date.decode('utf-8')
    return datetime.strptime(date, "%d-%m-%Y").weekday()
#Number of weeks and closing price
week, endPrice = np.loadtxt(
    fname="doc/data.csv",
    delimiter=",",
    usecols=(1, 6),
    converters={1: get_week},
    unpack=True
)
#print(week, endPrice)
allAvg = []
for weekday in range(5):
    #Judge the average closing price on Monday in turn,.... Friday
    average = endPrice[week == weekday].mean()
    allAvg.append(average)
    print("7.week%s average closing price:%s"% (weekday + 1, average))

print(allAvg)

print("**********************************************")
#[12, 23, 34, 45, 56]

print("8. Average closing price is lowest in weeks", np.argmin(allAvg) + 1)
print("8. Average closing price is highest in weeks", np.argmax(allAvg) + 1)

print("***********************************************************")
#Simple yield
simpleReturn = np.diff(endPrice)
print(simpleReturn)
#Logarithmic yield: The difference between the two after all prices are logarithmic.
logReturn = np.diff(np.log(endPrice))
print("6. Logarithmic yield:", logReturn)
# Annual volatility: std of the log returns divided by their mean,
# times the square root of the trading days in a year (usually 252).
annual_vol = logReturn.std() / logReturn.mean() * np.sqrt(252)
print(annual_vol)  # fixed: `Print` is a NameError in Python
# Monthly volatility: std of the log returns divided by their mean,
# times the square root of the trading months in a year (12).
month_vol = logReturn.std() / logReturn.mean() * np.sqrt(12)
print(month_vol)  # fixed: `Print (June_vol)` — wrong capitalization and an undefined name

pandas create series data type

"""
Pandas is a powerful toolset for analyzing structured data; it is based on Numpy, which provides high-performance matrix operations; used for data mining and analysis; and provides data cleaning capabilities.
One of the great tools: Series
    An object, similar to a one-dimensional array, consists of a set of data (various NumPy data types) and a set of associated data labels (i.e., indexes).Simple Series objects can also be generated from a single set of data.
Two sharp tools: DataFrame
    Is a tabular data structure in Pandas that contains an ordered set of columns, each of which can be of different value types (numeric, string, Boolean, etc.). The DataFrame has both row and column indexes and can be considered a dictionary made up of Series.

Common data types:
    -1-D: Series
    -2-D: DataFrame
    -3-D: Panel....
    -Four Dimensions: Panel4D.....
    - N dimension: PanelND....

Series is a one-dimensional data structure in Pandas, similar to lists in Python and Ndarray in Numpy, except that Series is one-dimensional, can store different types of data, and has a set of indexes corresponding to elements.
"""

import pandas as pd
import numpy as np
import  string

#View pandas version information
print(pd.__version__)

#**************************Create Series Object
#1). Create Series objects from lists
# (fixed: stray leading spaces caused an IndentationError at module level)
array = ["pink", "fan", "pink tape"]
#If no index is specified, the default starts at 0;
s1 = pd.Series(data=array)
print(s1)
#An explicit index can also be supplied:
ss1 = pd.Series(data=array, index=['A', 'B', 'C'])
print(ss1)

#2). Create a Series from a numpy ndarray
rand_values = np.random.randn(5)  # five draws from the standard normal distribution
s2 = pd.Series(data=rand_values)
print(s2)
# Cast the element dtype; `np.int` was removed in NumPy 1.24 — use the builtin `int`
ss2 = s2.astype(int)
print(ss2)

#3). Create a Series object from a dictionary; keys become the index, values become the data
#{'key1':'value1', "key2":'value2'}
# Build the mapping with a dict comprehension; renamed from `dict`,
# which shadowed the builtin type
letter_index = {string.ascii_lowercase[i]: i for i in range(10)}
print(letter_index)
s3 = pd.Series(letter_index)
print(s3)

Series Basic Operations:

Number attribute or method description
1 axes returns a list of row axis labels.
2 dtype Returns the data type (dtype) of the object.
3 empty Returns True if the series is empty.
4 ndim returns the dimension of the underlying data, which is defined by default: 1.
5 size returns the number of elements in the underlying data.
6 values returns the series as ndarray.
7 head() returns the first n rows.
8 tail() returns the last n rows.

"""

import pandas as pd
import numpy as np
import string

array = ["pink", "fan", "pink tape"]
s1 = pd.Series(data=array)
print(s1)
print(s1.axes) # [RangeIndex(start=0, stop=3, step=1)]
print(s1.dtype)
print(s1.empty) # False
print(s1.ndim) #Dimension 1
print(s1.size) # 3
print(s1.values) #Get all value values (do not display index)

#1). Modify Series Index
print(s1.index)
s1.index = ['A', 'B', 'C']
print(s1)

#2). Series longitudinal stitching;
array = ["pink", "fan", "westos"]
#If no index is specified, the default starts at 0;
s2 = pd.Series(data=array)
s3 = s1.append(s2)
print(s3)

#3). Delete the element corresponding to the specified index;
s3 = s3.drop('C') #Delete the value corresponding to'C';
print(s3)

#4). Finds elements by specified index
print(s3['B'])
# Missing data in pandas is represented by np.nan
s3['B'] = np.nan  # fixed: the original wrote `S3`, an undefined name (NameError)
print(s3)

#5). Slicing operations - same list
print(s3[:2])   # first two elements
print(s3[:-1])  # all elements except the last (fixed: `Print` is a NameError)
print(s3[-2:])  # last two elements

series operation

import numpy as np
import pandas as pd

#values = [0, 1, 2, 3, 4], index=['a', 'b', 'c', 'd', 'e']
s1 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])  # s1.index=[a, b, c, d, e]   s1.value=[0 1 2 3 4]
#values = [2, 3, 4, 5, 6, 7], index=['c', 'd', 'e', 'f', 'g', 'h']
s2 = pd.Series(np.arange(2, 8), index=['c', 'd', 'e', 'f', 'g', 'h'])  # s2.index = [c,d,e,f]

print(s1)
#print(s2)
##***************** is calculated according to the corresponding index and is populated with Nan if the indexes are different;
##Addition, missing value + true value===missing value
#print(s1 + s2)
#print(s1.add(s2))

#print(s1 - s2)
#print(s1.sub(s2))

#print(s1 * s2)
#print(s1.mul(s2))

#print(s1 / s2)
#print(s1.div(s2))

#median
print(s1)
print(s1.median())

#Summation
print(s1.sum())

#max
print(s1.max())

#min
print(s1.min())

Special where method

import pandas as pd
import numpy as np
import string

#The results of where method in &********series are completely different from those in numpy;
s1 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
#Determine if the value of s1 is greater than 3. If it is greater than 3, the value is unchanged. Otherwise, it is set to the missing value
print(s1.where(s1 > 3))

#Elements that are not greater than 3 in the object are assigned a value of 10; determine if the value of s1 is greater than 3, and if it is greater than 3, the value remains the same; otherwise, the value is set to 10
print(s1.where(s1 > 3, 10))

#Elements greater than 3 in the object are assigned a value of 10;
print(s1.mask(s1 > 3))
print(s1.mask(s1 > 3, 10))
"""

Create dataframe data type

Series Only row indexes, and DataFrame Objects have both row and column indexes
    //Row index, indicating different rows, horizontal index, called index,
    //Column index, indicating different columns, vertical index, called columns,

"""

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#Method 1: Create from a list
li = [
    [1, 2, 3, 4],
    [2, 3, 4, 5]
]

#The DataFRame object contains two indexes, row index (0 axis, axis=0) and column index (1 axis, axis=1)
d1 = pd.DataFrame(data=li, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d1)

#Method 2: Create from numpy object
#[0 1 2 3 4 5 6 7]  ====> [[0 1 2 3], [4 5 6 7]]
narr = np.arange(8).reshape(2, 4)
#The DataFRame object contains two indexes, row index (0 axis, axis=0) and column index (1 axis, axis=1)
d2 = pd.DataFrame(data=narr, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d2)

#Method 3: Create by dictionary;
dict = {
    'views': [1, 2, ],
    'loves': [2, 3, ],
    'comments': [3, 4, ]

}
d3 = pd.DataFrame(data=dict, index=['Vermicelli', "Fans"])
print(d3)

#Special cases of date operations:
#pd.date_range()
dates = pd.date_range(start='1/1/2019', end='1/08/2019')
print(dates)

#Row index
dates = pd.date_range(start='today', periods=6, freq='2D' ) # periods=6 Produces 6 dates backwards from today
print(dates)

#Row Index
columns = ['A', 'B', 'C', 'D']
d4 = pd.DataFrame( np.random.randn(6, 4), index=dates, columns=columns)
print(d4)

#One-dimensional object: Create an index of every day in 2019 with a random number value;
dates = pd.date_range(start='1/1/2021', end='1/3/2021', freq='D')
print(dates)
s1 = pd.Series([1, 2, 3], index=dates)
print(s1)

dataframe Basic Properties and Overall Situation Query

"""
a)Basic Properties
    df.shape  #Number of rows, columns
    df.dtype #Column data type
    df.ndim #Data Dimension
    df.index #Row index
    df.columns #Row Index
    df.values #Object value, two-dimensional ndarray array

b)Overall situation query
    df.head(3) #Show header rows, default 5 rows
    df.tail(3) #Show the last few lines, default 5 lines
    df.info() #Overview of related information: number of rows, number of columns, index, number of non-null columns, column type, memory usage
    df.describe() #Quick comprehensive statistical results: count, mean, standard deviation, maximum, quartile, minimum, etc.

"""

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

narr = np.arange(8).reshape(2, 4)
#The DataFrame object contains two indexes: row index (axis=0) and column index (axis=1)
d2 = pd.DataFrame(data=narr, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d2)

#******************************1). View basic properties***************
print(d2.shape)    # (number of rows, number of columns)
print(d2.dtypes)   # per-column data types
print(d2.ndim)     # number of dimensions
print(d2.index)    # row index
print(d2.columns)  # column index (the original comment wrongly said "Row Index")
print(d2.values, type(d2.values))  # the values, a two-dimensional ndarray

#****************************************2). Queries on the overall status of the data*********
print(d2.head(1))  # first rows, default 5
print(d2.tail(1))  # last rows, default 5

print("*" * 10)
#info() writes its report to stdout and returns None, so it must not be wrapped
#in print() (the original printed a spurious "None").
print("info:")
d2.info()

print("Statistics".center(50, '*'))
#Quick summary statistics: count, mean, std, min, quartiles, median, max.
print(d2.describe())

#3). Transpose operation
print("d2: \n", d2)
#Equivalent forms: d2.transpose(), d2.swapaxes(1, 0)
print("d2 T: \n", d2.T)

#4). Sort by column
print(d2)
#Sort by the specified columns; ascending by default, ascending=False for descending.
print(d2.sort_values(by=["views", 'tranfers'], ascending=False))

#5). Slices and queries
print(d2)
print(d2[:2])  # row slicing works, but plain integer indexing (d2[0]) does not
print('1:\n', d2['views'])  # single-column lookup by label
print('2:\n', d2.views)     # attribute access, equivalent to the above
print(d2[['views', 'comments']])  # multiple columns via a list of labels

#6). Row access by position vs. by label:
#   - iloc: select rows by integer position
#   - loc:  select rows by index label
print(d2.iloc[0])
print(d2.iloc[-1])

#d2['A'] would raise: plain [] looks up columns, not row labels.
print(d2)
print(d2.loc['A'])

#7). Assigning through loc mutates the frame in place; setting a whole row to
#NaN also promotes the integer columns to float.
d2.loc['A'] = np.nan
print(d2)

d2.info()

Reading and Writing Files

import os

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#csv, excel, json........
#1). Writing csv files

#Demo data: five city-level records across two provinces.
df = pd.DataFrame(
    {'province': ['Shaanxi', 'Shaanxi', 'Sichuan', 'Sichuan', 'Shaanxi'],
     'city': ['Xianyang', 'Baoji', 'Chengdu', 'Chengdu', 'Baoji'],
     'count1': [1, 2, 3, 4, 5],
     'count2': [1, 2, 33, 4, 5]
     }
)

print(df)

filename = os.path.join('doc', 'csvFile.csv')
"""
index=True/False   whether to store the row index; normally not stored
mode='w'           how the file is written; default 'w' (truncate and rewrite), 'a' appends
header=True/False  whether to write the header row (column labels); generally wanted
"""
#df.to_csv(filename, index=False, mode='a', header=False, sep=' ')  # index=False does not store row indexes
#print("csv file saved successfully")

##2). Read CSV file
#df2 = pd.read_csv('doc/csvFile.csv')
#print(df2)

#3). Writing Excel files
#os.path.join keeps the path portable: the original "doc\excelFile.xlsx"
#embedded a literal backslash in the filename on POSIX systems.
df.to_excel(os.path.join('doc', 'excelFile.xlsx'), sheet_name="Provincial Statistics", index=False)
print("excel File saved successfully")

groupby for grouping and aggregation operations

"""
pandas provides a flexible and efficient groupby facility:
    1). It lets you split, apply, and combine (summarize) data in a natural way.
    2). It splits a pandas object by one or more keys (which may be functions,
        arrays, or DataFrame column names).
    3). It computes grouped summary statistics, such as counts, averages,
        standard deviations, or user-defined functions.
"""

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Demo data: five city-level records spread over two provinces.
df = pd.DataFrame(
    {'province': ['Shaanxi', 'Shaanxi', 'Sichuan', 'Sichuan', 'Shaanxi'],
     'city': ['Xianyang', 'Baoji', 'Chengdu', 'Chengdu', 'Baoji'],
     'count1': [1, 2, 3, 4, 5],
     'count2': [1, 2, 33, 4, 5]
     }
)

print(df)

# Group the count1 column by the province key and summarize each group.
grouped = df['count1'].groupby(df['province'])
print(grouped.describe())
print(grouped.median())

# Same idea with city as the key: per-city maximum of count1.
grouped = df['count1'].groupby(df['city'])
print(grouped.max())

# Aggregate on two keys at once — (province, city) pairs.
grouped = df['count1'].groupby([df['province'], df['city']])
print(grouped.max())
print(grouped.sum())
print(grouped.count())

# unstack() pivots the inner index level (city) into columns,
# turning the hierarchical index into a 2-D table.
print(grouped.max().unstack())

Case study: commodity data analysis

"""
File description: the columns are order id, order quantity, item name,
item choice description, and item total price.

Requirement 1:
    1). Read all data from the file.  How? pd.read_csv (not to_csv, which writes).
    2). Get all item names in the data. How to get one column of a DataFrame?
        df['column name'] or df.column_name
    3). Sort by item price in descending order. How to sort a DataFrame?
        df.sort_values(by=["column name"], ascending=False)
    Write the 20 most expensive items into mosthighPrice.xlsx:
        take df.head(20) of the sorted frame, then write it out with to_excel.

Requirement 2:
    1). Count the frequency of each item in column [item_name] and draw a bar
        chart (ranking of the most purchased items — draw the first 5 records).
    2). Group by column [order_id] to find the total amount spent on each order.
    3). Draw a scatter plot of each order's total amount against its total
        item quantity.
"""

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

##Requirement 1:
##1). Read all the data from the file;
##2) Get all the names of goods in the data;
#goodsInfo = pd.read_csv('doc/chipo.csv')
##print(goodsInfo.head())
##print(goodsInfo.tail())
##print(goodsInfo.info())
##print(goodsInfo.describe())
#print("commodity name display: \n", goodsInfo['item_name'].head())
#print("commodity name display: \n", goodsInfo.item_name.head())

##Requirement 1:

 #3. Sort in descending order according to the price of the goods.
##Write the information of the 20 highest-priced products into the mosthighPrice.xlsx file;
##Re-assignment;
#goodsInfo.item_price = goodsInfo.item_price.str.strip('$').astype(float)
#highPriceData = goodsInfo.sort_values('item_price', ascending=False).head(20)
##print(highPriceData.head(5))
#filename = 'doc/mostHighPrice.xlsx'
#highPriceData.to_excel(filename)
#print("saved successfully......")

#Requirement 2:
#1). Count column [item_name] frequencies and draw a bar chart
#(top 5 purchases - first 5 records)
#Forward slash keeps the path portable (the original 'doc\chipo.csv'
#embedded a literal backslash in the filename on POSIX systems).
goodsInfo = pd.read_csv('doc/chipo.csv')
#newInfo counts the occurrences of each item name; the 'Unnamed: 0' column
#(the csv's unnamed first column) carries the per-item frequency we need.
newInfo = goodsInfo.groupby('item_name').count()
mostRaiseGoods = newInfo.sort_values('Unnamed: 0', ascending=False)['Unnamed: 0'].head(5)
print(mostRaiseGoods)       # Series object

#Item names for the chart's x axis.
x = mostRaiseGoods.index
#Occurrence counts for the y axis.
y = mostRaiseGoods.values

#from pyecharts import Bar
#bar = Bar("ranking of the most purchased items")
#bar.add("", x, y)
#bar.render()

#Requirement 2:
#2). Group by column [order_id] and find the total spent per order (quantity, item_price).
#3). Draw a scatter chart of each order's total amount vs. its total item quantity.

goodsInfo = pd.read_csv('doc/chipo.csv')
#Order quantities.
quantity = goodsInfo.quantity
#Item prices: strip the leading '$' and convert to float.
#(np.float was removed in NumPy 1.24; the builtin float is the replacement.)
item_price = goodsInfo.item_price.str.strip('$').astype(float)
print(item_price)

#Group rows by order id.
order_group = goodsInfo.groupby("order_id")
#Total amount per order.
x = order_group.item_price.sum()
#Total quantity of goods per order.
y = order_group.quantity.sum()

Common String Operations

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# A small Series of strings (with one missing value) to demonstrate the
# vectorized .str accessor methods.
series1 = pd.Series(['$A:1', '$B:2', '$C:3', np.nan, '$cat:3'])
print(series1)

# Lower-case every element; missing values pass through as NaN.
print(series1.str.lower())

# Upper-case every element; missing values pass through as NaN.
print(series1.str.upper())

# Split each element on ':' into a list of parts.
print(series1.str.split(":"))

# Trim the '$' character from both ends of each element.
print(series1.str.strip('$'))
"""
File contents: total bill amount, tip amount, gender, smoker, date, time, weekday.
Requirements:

  • Scatter chart of bill amount vs. tip for smokers and non-smokers respectively;
  • Scatter chart of bill amount vs. tip for smoking and non-smoking customers,
    split by gender (women and men);
"""

Association between consumption and tips and gender and smoking

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

filename = 'doc/tips.csv'
data = pd.read_csv(filename)
# Quick sanity checks while developing:
#print(data.head())
#print(data.info())

# Earlier experiment, kept for reference: smokers-only scatter data.
# smoker = data[data['smoker'] == 'Yes']
# x_total_bill1 = smoker['total_bill']
# y_tip1 = smoker['tip']
# and the non-smoker counterpart:
#no_smoker = data[data['smoker'] != 'Yes']
#x_total_bill2 = no_smoker['total_bill']
#y_tip2 = no_smoker['tip']
#from pyecharts import Scatter
#scatter = Scatter("...")
#scatter.add("smoking", x_total_bill1, y_tip1)
#scatter.add("no smoking", x_total_bill2, y_tip2)
#scatter.render()

# Boolean masks over the rows: smoker == 'Yes' and sex == 'Female'.
is_smoker = data['smoker'] == 'Yes'
is_female = data['sex'] == 'Female'

# The four (gender, smoking) combinations via mask intersection; ~ negates a mask.
female_smoker = data[is_female & is_smoker]
female_no_smoker = data[is_female & ~is_smoker]
male_smoker = data[~is_female & is_smoker]
male_no_smoker = data[~is_female & ~is_smoker]

# One scatter series per customer group (pyecharts v0.x API).
from pyecharts import  Scatter
scatter = Scatter("Scatter chart between consumption amount and tip")
scatter.add("Women Customers Smoking", female_smoker['total_bill'], female_smoker['tip'])
scatter.add("Non-smoking female customer", female_no_smoker['total_bill'], female_no_smoker['tip'])
scatter.add("Smoking Male Customer", male_smoker['total_bill'], male_smoker['tip'])
scatter.add("Non-smoking male customers", male_no_smoker['total_bill'], male_no_smoker['tip'])

# Writes render.html in the working directory.
scatter.render()

Tags: Python Excel Attribute less

Posted on Sat, 02 May 2020 19:08:51 -0700 by malam