# numpy create array

```#Import the numpy library and rename it to np.
import numpy as np

# 1). Create an array: a, b and c all hold the same values; any form works.
# Method 1: pass a list (or any sequence) directly to np.array.
a = np.array([1, 2, 3, 4, 5])
# np.array also accepts a range object.
b = np.array(range(1, 6))
print(b)
li = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11.8, 12]
]
# dtype converts every element at creation time; use the builtin `int`
# (the np.int alias was deprecated in NumPy 1.20 and removed in 1.24).
li_ndarray = np.array(li, dtype=int)

# Method 2: generate the values directly.
c = np.arange(1, 6)
# arange mirrors range() but additionally supports float steps, e.g.:
# d = np.arange(2.2, 13.3, 0.2)
# print(a, b, c)

# 2). View the type of the created arrays (all numpy.ndarray).
print(type(a))
print(type(b))
print(type(c))

# 3). Inspect the element dtype; int64 here because the platform is 64-bit.
print(a.dtype)

# 4). Specify the element dtype of a new array explicitly (builtin `float`;
# the np.float alias was removed together with np.int).
d = np.array([1.9, 0, 1.3, 0], dtype=float)
print(d, d.dtype)
# 5). Convert the dtype of an existing array.
e = d.astype('int64')   # accepts a dtype object or a type-code string
print(e, e.dtype)

# 6). Control the number of decimal places of floats.
# Create a random 3x4 array with values in [0, 1).
f = np.random.random((3, 4))
print(f)

# Round every float to 3 decimal places.
g = np.round(f, 3)
print(g)```

# Transposition of Matrix

```import numpy as np

# A 3x4 sample of uniform values in [0, 1): 3*4 = 12 numbers in total.
# 12 factors as 1*12 = 2*6 = 3*4 = 4*3 = 6*2 = 12*1, so any of those
# shapes can hold the same data; reshape to 2x6 to demonstrate.
data = np.random.random((3, 4)).reshape((2, 6))

"""
What is a transpose?
1). The rows of the original matrix become the columns of the result.
2). Every element of A is mirrored across the main (45 degree) diagonal.

Example (3x2):
a = [[1, 2],
     [3, 4],
     [5, 6]]

Transposed (2x3):
b = [[1, 3, 5],
     [2, 4, 6]]
"""
print(data)
print(data.shape)
print("Transpose: ", data.T)
print("Transpose: ", data.transpose())
# numpy calls the row dimension axis 0 and the column dimension axis 1.
print("Transpose: ", data.swapaxes(1, 0))```

# Index and slice of numpy

```import numpy as np

#1). Generate test data
#np.arange(12)  ==> ndarray:[0 1 2 3 4 5 6 7 8 9 10 11]
#reshape((3, 4))  ===>
a = np.arange(12).reshape((3, 4))

print(a)

#*******************Single row or single column***************
#Take line 2; a[i]=>Get line i+1
print("Line 2: ", a[1])
#Take column 3; a[:, i] =>Get column i+1
print("Column 2 Contents: ",a[:, 2])
#Get data for row 2 and column 3; a[i, j] ==> Get row i+1 j+1 column
print("Contents of row 2 and column 3: ", a[1, 2])

#*******************Take consecutive rows or columns***************
#Take rows 2 and 3; a[i:j]===>Get rows i+1 through J
print(a[1:3])

#Take columns 3 and 4; a[:, i:j] ==> Get columns i+1 to j
print(a[:, 2:4])

#Rows: 1 and 2 columns: 2, get the second column in the first 2 rows
print(a[0:2, 1:2])

"""
//Requirement: Take out an array of 2*2
1 2
5 6
1). Get the first 2 rows first
2). Get columns 2 and 3 from the first 2 rows
"""
print(a[:2, 1:3])

#*******************Take discontinuous rows or columns***************
#Rows: 1 and 3 columns: all gets all the elements of the first and third rows
print(a[[0, 2], :])
#Row: all column: 1, 4
print(a[:, [0, 3]])
#Rows: 1, 3 columns: 14 Gets the elements of the first row, the first column, and the third row, the fourth column
print("*"*10)
print(a[[0, 2], [0, 3]])```

# Modification of numpy values

```import numpy as np

#Perform row and specified column modifications
t = np.arange(24).reshape((4, 6))
print(t)

#Method 1: Evaluate elements according to index/slice.
#Row: all, column: 3,4
#Ideas for indexing: 2~4-1 columns: 3~4
#Row/Column Position: 3~4
t[:, 2:4] = 0
print(t)

#Method 2: Boolean index, if the condition/Bool=True is satisfied, assign value, otherwise, do not modify
#Returns the matrix (as with the size of t) and stores the Bool type
print(t < 10)

t[t < 10] = 100
print(t)

t[t > 20] = 200
print(t)

#Method 3: numpy's ternary operator t<100?0:10
# Satisfies the condition/Bool=True, then assigns = value1, otherwise assigns = value2
t1  = np.where(t < 100, 0, 10)
print(t)
print(t1)```

# Get the four corner elements of a matrix

```"""
Gets the elements of the four corners of a 4x3 array. The corner row
indexes are [[0, 0], [3, 3]] and the column indexes are [[0, 2], [0, 2]].

0  1  2
3  4  5
6  7  8
9 10 11

"""

import numpy as np


def get_edge(data):
    """Return the four corner elements of a 2-D array as a 2x2 array.

    Fancy indexing with matching 2x2 row/column index arrays picks
    (0, 0), (0, last), (last, 0) and (last, last) in a single step.
    """
    row, column = data.shape
    rows = np.array([[0, 0], [row - 1, row - 1]])
    cols = np.array([[0, column - 1], [0, column - 1]])
    return data[rows, cols]


if __name__ == '__main__':
    x = np.arange(30).reshape((5, 6))
    print("data: ", x)
    print("result: ", get_edge(x))

    # Worked example with a 4x3 array: np.arange(12).reshape((4, 3))
    # corner values and indexes: 0=(0,0)  2=(0,2)  9=(3,0)  11=(3,2)
    # rows = [[0, 0], [3, 3]], cols = [[0, 2], [0, 2]] -> [[0, 2], [9, 11]]
#print(y)```

# Modification of Array Shape

```"""
reshape Modify shapes without changing data
numpy.reshape(arr, newshape, order='C')
order: 'C' -- By line,'F' -- By column,'A' -- In the original order,'k' -- The order in which elements appear in memory.
flat    Array Element Iterator

flatten Returns a copy of the array, changes made to the copy will not affect the original array
ravel   Return Expanded Array
"""

import numpy as np

print("****************************************flat********************************")
a = np.arange(9).reshape(3, 3)
print('Original array:')
for row in a:
    print(row)

# The flat attribute is a 1-D iterator over every element of the array.
print('Iterated array:')
for element in a.flat:
    print(element)

print("*********************************flatten**************************************")
a = np.arange(8).reshape(2, 4)

print('Original array:')
print(a)
print('\n')
# flatten returns a *copy*; modifying the copy never touches the
# original. The default order is 'C' (row by row).

print('Expanded array:')
print(a.flatten())
print('\n')

print('with F Style order expanded array:')
print(a.flatten(order='F'))

print("*********************************ravel*************************************")
a = np.arange(8).reshape(2, 4)

print('Original array:')
print(a)
print('\n')

# ravel returns a flattened array (a view when possible, so writes
# can propagate back to the original).
print('call ravel After function:')
print(a.ravel())
print('\n')

print('with F Style Sequential Call ravel After function:')
print(a.ravel(order='F'))
print(a.ravel(order='F'))```

# Array Stitching

```"""
concatenate Connect an array sequence along an existing axis
stack   Add a series of arrays along the new axis.
hstack  Arrays in a Horizontal Stacked Sequence (Column Direction)
vstack  Arrays in a vertical stacked sequence (row direction)

"""

import numpy as np

print("******************** concatenate ****************")
a = np.array([[1, 2], [3, 4]])
print('First array:')
print(a)
print('\n')

b = np.array([[5, 6], [7, 8]])
print('Second array:')
print(b)
print('\n')

#concatenate joins arrays of the same dimensionality along an
#existing axis (axis 0 = rows, axis 1 = columns); the default is axis 0.
print(np.concatenate((a, b)))
print('\n')

#Join the two arrays side by side along axis 1:
print(np.concatenate((a, b), axis=1))

print("*************************stack*********************************")
#stack, unlike concatenate, joins the inputs along a *new* axis:
#a = np.array([[1, 2], [3, 4]])
#print('first array:')
#print(a)
#print('\n')
#b = np.array([[5, 6], [7, 8]])

#print('second array:')
#print(b)
#print('\n')
#print('stack two arrays along axis 0:')
#stack_0 = np.stack((a, b), axis=-1)
#print(stack_0)
#print(stack_0.shape)
#print('\n')
#print('stack two arrays along axis 1:')
#print(np.stack((a, b), axis=1))

print("**************************************hstack + vstack*************************************")
a = np.array([[1, 2], [3, 4]])

print('First array:')
print(a)
print('\n')
b = np.array([[5, 6], [7, 8]])

print('Second array:')
print(b)
print('\n')

#hstack stacks column-wise (same as concatenate along axis 1).
print('Horizontal stacking:')
c = np.hstack((a, b))
print(c)
print('\n')

#vstack stacks row-wise (same as concatenate along axis 0).
print('Vertical stacking:')
c = np.vstack((a, b))
print(c)
print('\n')```

# Array Split

```"""
split   Split an array into subarrays
numpy.split(ary, indices_or_sections, axis)
hsplit  Split an array horizontally into subarrays (by column)
vsplit  Divide an array vertically into subarrays (by row)
"""

import numpy as np

print("**********************split******************************")
a = np.arange(9)
print('First array:')
print(a)
print('\n')

#Passing an int splits the array into that many equal-sized parts.
print('Divide an array into three equally sized subarrays:')
b = np.split(a, 3)
print(b)
print('\n')

#Passing a 1-D array of indices splits *at* those positions instead.
print('Split the position of the array indicated in a one-dimensional array:')
b = np.split(a, [1, 7])
print(b)

print('******************hsplit*****************')
harr = np.arange(12).reshape((3, 4))
print('primary array: ')
print(harr)

#hsplit cuts along columns (the horizontal axis).
print('After horizontal splitting:')
print(np.hsplit(harr, 2))

print("***************************vsplit****************************")

a = np.arange(12).reshape(4, 3)

print('First array:')
print(a)
print('\n')

#vsplit cuts along rows (the vertical axis).
print('Vertical division:')
b = np.vsplit(a, 2)
print(b)```

# Addition and deletion of array elements

```"""
resize  Returns a new array of the specified shape
append  Add value to end of array
insert  Inserts the value along the specified axis before the specified subscript
delete  Delete the subarray of an axis and return the deleted new array
unique  Find the only element in the array
arr: Input array, expands if it is not a one-dimensional array
return_index: If is true，Returns the position (subscript) of the new list element in the old list and stores it as a list
return_counts: If is true，Returns the number of occurrences of elements in a de-multiplication array in the original array

"""
import numpy as np

print('***************append****************')
a = np.array([[1, 2, 3], [4, 5, 6]])

print('First array:')
print(a)
print('\n')

#When no row/column append element is specified, the preceding element is expanded before appending by default
print(np.append(a, [7, 8, 9]))      # [1 2 3 4 5 6 7 8 9]
print('\n')

print(np.append(a, [[7, 8, 9]], axis=0))
print('\n')

print(np.append(a, [[5, 5, 5], [7, 8, 9]], axis=1))

print('******************************insert****************************************')
a = np.array([[1, 2], [3, 4], [5, 6]])

print('First array:')
print(a)
print('\n')

print('Undelivered Axis Parameters.The input array is expanded before insertion.')
print(np.insert(a, 3, [11, 12]))
print('\n')

print('Passed Axis Parameters.An array of values is broadcast to match the input array.')
print(np.insert(a, 1, [11, 100], axis=0))
print('\n')

print(np.insert(a, 1, [11,12, 13], axis=1))

print('***********************delete******************************************')
a = np.arange(12).reshape(3, 4)

print('First array:')
print(a)
print('\n')

print('Undelivered Axis Parameters.The input array is expanded before insertion.')
print(np.delete(a, 5))
print('\n')

print('Delete the second column:')
print(np.delete(a, 1, axis=1))
print('\n')

print('Delete the second line:')
print(np.delete(a, 1, axis=0))
print('\n')

print('Slices containing alternate values deleted from the array:')
a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
"""
np.s_ A better way to index tuples for arrays.When returned slice object;
//You can also use `Slice()'with some special objects to do all this, but this version is simpler because it uses the standard array index syntax.
"""
print(np.delete(a, np.s_[::2]))

print("Delete 2D Array")
data = np.arange(12).reshape((3, 4))
print("Array Elements:")
print(data)
#Row: Column: Start of 2 Columns
print(np.delete(data, np.s_[::2], axis=0))
print(np.delete(data, np.s_[::2], axis=1))

print('****************************unique**********************************************')

a = np.array([5, 2, 6, 2, 7, 5, 6, 8, 2, 9])

print('First array:')
print(a)
print('\n')

print('The de-multiplication value of the first array:')     # [2 5 6 7 8 9]
u = np.unique(a)
print(u)
print('\n')

print('Indexed array of de-multiplied arrays:')     # [a.index(2), a.index(5), ........a.index(9)]
u, indices = np.unique(a, return_index=True)
print(indices)
print('\n')

print('Returns the number of repetitions of a De-heavy element:')
u, indices = np.unique(a, return_counts=True)
print(u)
print(indices)```

# numpy statistical function

"""
numpy.amin() is used to calculate the minimum value of elements in an array along a specified axis.
numpy.amax() is used to calculate the maximum value of elements in an array along a specified axis.
The numpy.ptp() function calculates the difference (maximum-minimum) between the maximum and minimum values of elements in an array.
The numpy.percentile() percentile is a measure used in statistics to represent the percentage of observations smaller than this value.
The numpy.median() function calculates the median (median) of the elements in array a

#Mean value, each value has the same proportion/weight. Language: 100 Maths: 70 English: 80 (100+70+80)/3
The numpy.mean() function returns the arithmetic mean of the elements in the array.If an axis is provided, it is calculated along it.
#Weighted average: each value has its own proportion/weight. Language: 100 (40%), Mathematics: 70 (40%), English: 80 (20%) -> 100*0.4 + 70*0.4 + 80*0.2
The numpy.average() function calculates the weighted average of the elements in the array based on the respective weights given in another array.Average()

The standard deviation of np.std() is a measure of the dispersion of the mean values of a set of data.
The standard deviation formula is: std = sqrt(sum((x - x.mean())**2) / n)
The variance (sample variance) in np.var() statistics is the average of the squared differences between each sample value and the sample mean, i.e. var = sum((x - x.mean())**2) / n.
The standard deviation is the square root of the variance.

`Statistical Analysis of # numpy Students'Height and Weight`
```Requirement 1:
//Get the height of all boys, average; get the height of all girls, average; and draw a column chart to show
1). Read from file Sex and Height
2). Get the height of a man or a woman by name based on a Boolean index  data['gender'] == 'M'
3). Average mean function

//Requirement 2:
//Obtain the weight of all boys and average; Obtain the weight of all girls and average; Draw a column chart to show

"""
import numpy as np


def get_avg_height():
    """Return (male_avg, female_avg) height read from the student data file.

    The file's column 2 holds the gender as a single byte (b'M'/b'F')
    and column 4 the height; the first 9 rows are header material.
    """
    fname = "doc/eg6-a-student-data.txt"
    # Declare the dtype of each loaded column.
    dtype = np.dtype([('gender', '|S1'), ('height', 'f2')])
    # Reconstructed load call (original line was truncated): read the
    # gender and height columns, skipping the 9 header rows.
    data = np.loadtxt(fname, dtype=dtype, skiprows=9, usecols=(1, 3))
    # Boolean mask selecting male rows (values are bytes, hence b'M').
    isMale = (data['gender'] == b'M')
    male_avg_height = data['height'][isMale].mean()
    # ~ inverts the mask, selecting the female rows.
    female_avg_height = data['height'][~isMale].mean()
    return male_avg_height, female_avg_height


def parser_weight(weight):
    """Convert a raw weight field to float; return -99 for unparseable values."""
    try:
        return float(weight)
    except ValueError:
        return -99


def get_avg_weight():
    """Return (male_avg, female_avg) weight, ignoring missing values."""
    fname = "doc/eg6-a-student-data.txt"
    dtype = np.dtype([('gender', '|S1'), ('weight', 'f2')])
    # Reconstructed load call: read gender and weight, skipping 9 header
    # rows; bad weight fields become -99 via the converter so they can
    # be filtered out below.
    data = np.loadtxt(fname, dtype=dtype, skiprows=9,
                      usecols=(1, 4), converters={4: parser_weight})

    isMale = data['gender'] == b'M'
    # Only weights > 0 are valid (missing values were mapped to -99).
    is_weight_vaild = data['weight'] > 0
    male_avg_weight = data['weight'][isMale & is_weight_vaild].mean()
    female_avg_weight = data['weight'][~isMale & is_weight_vaild].mean()
    return male_avg_weight, female_avg_weight


if __name__ == '__main__':
    print(get_avg_height())
    print(get_avg_weight())
"""```

# Statistical Analysis and Application of Stock Price Based on numpy

```#Acquisition of stock data information
1. Web Crawlers
2. Website download: Special stock website, Kaggle (website for data analysis and machine learning exchange)
3. Use the developed API interface for stocks: tushare

The data.csv file stores stock information, with columns 4-8, the D-H column in the EXCEL table, (K line=)
They are the opening price, the highest price, the lowest price, the closing price and the volume of the stock.
Analysis Angle:
1. Calculate Volume Weighted Average Price
Concepts: Volume weighted average price, the English name VWAP(Volume-Weighted Average Price) is a very important classic
Economics represents the "average" price of financial assets.
The larger the volume of a price, the greater the weight it takes.VWAP is a weighted average calculated by weighting the volume.

2. Calculate Maximum and Minimum: Calculate the maximum and minimum of the latest highest and lowest prices of a stock price
3. Calculate the difference between the maximum and minimum recent price of a stock; -- (extreme)
Calculates the difference between the minimum and maximum recent price of a stock
4. Calculate the median closing price
5. Calculate variance of closing price
6. Calculate logarithmic returns, stock returns, annual and monthly fluctuations
*** Closing price analysis is often based on stock returns.
The stock return rate can be divided into simple return rate and logarithmic return rate.
Simple rate of return: refers to the rate of change between two adjacent prices.diff
Logarithmic yield: The difference between the two after all prices are logarithmic.
#[1, 2,3 4]   ======>[-1, ]
*** Method used: The diff function in NumPy returns an array of differences between adjacent array elements.
Note, however, that diff returns an array with one element less than the closing price array.
*** In investment, volatility is a measure of price movement, which can be calculated from historical price data.When calculating historical volatility, you need to use
To the logarithmic rate of return.
Annual volatility is the standard deviation of the logarithmic yield divided by its mean and multiplied by the square root of the transaction day, usually 252 days.
Monthly fluctuation is the standard deviation of the logarithmic yield divided by its mean and multiplied by the square root of the trading month.Usually the transaction month is December.
7. Get the average closing prices for the trading days of Monday, Tuesday, Wednesday, Thursday and Friday within that time range
8. Average closing price is lowest and highest is on the day of the week

"""
import numpy as np


def get_week(date):
    """Return the weekday index of a b'dd-mm-yyyy' date (Monday == 0 ... Sunday == 6)."""
    from datetime import datetime
    # np.loadtxt hands converters bytes, not str, so decode first.
    date = date.decode('utf-8')
    return datetime.strptime(date, "%d-%m-%Y").weekday()


def main():
    """Run all the statistical analyses on doc/data.csv.

    Columns 4-8 of the file hold the opening price, highest price,
    lowest price, closing price and traded volume.
    """
    print("**********************************************")

    # Closing price (column 6) and traded volume (column 7).
    # (The loadtxt calls below are reconstructed; the original lines
    # were truncated to their orphaned keyword arguments.)
    endPrice, countNum = np.loadtxt(
        fname="doc/data.csv",
        delimiter=",",          # the file is comma separated
        usecols=(6, 7),
        unpack=True)
    # print(endPrice, countNum)
    # A price's weight is proportional to the volume traded at it.
    VWAP = np.average(endPrice, weights=countNum)
    print("1. Calculate Volume Weighted Average Price:", VWAP)

    print("**********************************************")

    # Highest (column 4) and lowest (column 5) prices.
    highPrice, lowPrice = np.loadtxt(
        fname="doc/data.csv",
        delimiter=",",
        usecols=(4, 5),
        # unpack=True returns one array per requested column.
        unpack=True
    )

    print(highPrice, lowPrice)
    # Maximum of the highest prices and minimum of the lowest prices.
    print("2. Maximum Price:", highPrice.max())
    print("2. Minimum price:", lowPrice.min())

    print("**********************************************")
    # Range (ptp = max - min) of the recent highest and lowest prices.
    print("3. Extremely bad recent top price:", np.ptp(highPrice))
    print("3. Extremely bad recent minimum price:", np.ptp(lowPrice))

    print("**********************************************")
    print("4. Calculate the median closing price:", np.median(endPrice))

    print("**********************************************")
    print("5. Calculate closing price variance:", np.var(endPrice))

    print("**********************************************")

    # Weekday index (column 1, converted by get_week) and closing price.
    week, endPrice = np.loadtxt(
        fname="doc/data.csv",
        delimiter=",",
        usecols=(1, 6),
        converters={1: get_week},
        unpack=True
    )
    # print(week, endPrice)
    allAvg = []
    for weekday in range(5):
        # Average closing price for Monday, ..., Friday in turn.
        average = endPrice[week == weekday].mean()
        allAvg.append(average)
        print("7.week%s average closing price:%s" % (weekday + 1, average))

    print(allAvg)

    print("**********************************************")
    print("8. Average closing price is lowest in weeks", np.argmin(allAvg) + 1)
    print("8. Average closing price is highest in weeks", np.argmax(allAvg) + 1)

    print("***********************************************************")
    # Simple return: change between adjacent prices. Note that diff
    # yields one element fewer than its input.
    simpleReturn = np.diff(endPrice)
    print(simpleReturn)
    # Log return: difference of the logs of adjacent prices.
    logReturn = np.diff(np.log(endPrice))
    print("6. Logarithmic yield:", logReturn)
    # Annual volatility: std of the log returns divided by their mean,
    # scaled by sqrt(252) trading days per year.
    annual_vol = logReturn.std() / logReturn.mean() * np.sqrt(252)
    print(annual_vol)
    # Monthly volatility: same, scaled by sqrt(12) trading months.
    month_vol = logReturn.std() / logReturn.mean() * np.sqrt(12)
    print(month_vol)


if __name__ == '__main__':
    main()
Print (June_vol)```

# pandas create series data type

```"""
Pandas is a powerful toolset for analyzing structured data; it is based on Numpy, which provides high-performance matrix operations; used for data mining and analysis; and provides data cleaning capabilities.
One of the great tools: Series
An object, similar to a one-dimensional array, consists of a set of data (various NumPy data types) and a set of associated data labels (i.e., indexes).Simple Series objects can also be generated from a single set of data.
Two sharp tools: DataFrame
Is a tabular data structure in Pandas that contains an ordered set of columns, each of which can be of different value types (numeric, string, Boolean, etc.). The DataFrame has both row and column indexes and can be considered a dictionary made up of Series.

Common data types:
-1-D: Series
-2-D: DataFrame
-3-D: Panel....
-Four Dimensions: Panel4D.....
- N dimension: PanelND....

Series is a one-dimensional data structure in Pandas, similar to lists in Python and Ndarray in Numpy, except that Series is one-dimensional, can store different types of data, and has a set of indexes corresponding to elements.
"""

import pandas as pd
import numpy as np
import  string

# Show the pandas version in use.
print(pd.__version__)

# ************************** Create Series objects **************************
# 1). From a list; without an explicit index, labels default to 0, 1, 2...
array = ["pink", "fan", "pink tape"]
s1 = pd.Series(data=array)
print(s1)
# An explicit index replaces the default integer labels.
ss1 = pd.Series(data=array, index=['A', 'B', 'C'])
print(ss1)

# 2). From a numpy ndarray.
n = np.random.randn(5)  # five standard-normal samples
s2 = pd.Series(data=n)
print(s2)
# Convert the element dtype; use the builtin `int`
# (np.int was deprecated in NumPy 1.20 and removed in 1.24).
ss2 = s2.astype(int)
print(ss2)

# 3). From a dict: keys become the index, values become the data.
# e.g. {'key1': 'value1', 'key2': 'value2'}
# (renamed from `dict` so the builtin is not shadowed)
letter_map = {string.ascii_lowercase[i]: i for i in range(10)}
print(letter_map)
s3 = pd.Series(letter_map)
print(s3)```

# Series Basic Operations:

Number attribute or method description
1 axes returns a list of row axis labels.
2 dtype Returns the data type (dtype) of the object.
3 empty Returns True if the series is empty.
4 ndim returns the dimension of the underlying data, which is defined by default: 1.
5 size returns the number of elements in the underlying data.
6 values returns the series as ndarray.
7 head() returns the first n rows.
8 tail() returns the last n rows.

"""

import pandas as pd
import numpy as np
import string

array = ["pink", "fan", "pink tape"]
s1 = pd.Series(data=array)
print(s1)
print(s1.axes)    # [RangeIndex(start=0, stop=3, step=1)]
print(s1.dtype)   # element dtype
print(s1.empty)   # False
print(s1.ndim)    # always 1 for a Series
print(s1.size)    # number of elements: 3
print(s1.values)  # the data as an ndarray, without the index

# 1). Replace the Series index.
print(s1.index)
s1.index = ['A', 'B', 'C']
print(s1)

# 2). Vertical concatenation of two Series.
array = ["pink", "fan", "westos"]
# Without an explicit index, labels default to 0, 1, 2...
s2 = pd.Series(data=array)
# Series.append was removed in pandas 2.0; pd.concat is the
# equivalent, version-stable spelling.
s3 = pd.concat([s1, s2])
print(s3)

# 3). Drop the element with the given index label.
s3 = s3.drop('C')
print(s3)

# 4). Look up an element by index label.
print(s3['B'])
s3['B'] = np.nan  # pandas marks empty/missing data with np.nan
print(s3)

# 5). Slicing works like a list.
print(s3[:2])   # first two elements
print(s3[:-1])  # everything except the last element
print(s3[-2:])  # last two elements
# series operation

```import numpy as np
import pandas as pd

#values = [0, 1, 2, 3, 4], index = ['a', 'b', 'c', 'd', 'e']
s1 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
#values = [2, 3, 4, 5, 6, 7], index = ['c', 'd', 'e', 'f', 'g', 'h']
s2 = pd.Series(np.arange(2, 8), index=['c', 'd', 'e', 'f', 'g', 'h'])

print(s1)
#print(s2)
#Arithmetic aligns on the index; labels present in only one operand
#produce NaN (missing + value == missing).
#print(s1 + s2)

#print(s1 - s2)
#print(s1.sub(s2))

#print(s1 * s2)
#print(s1.mul(s2))

#print(s1 / s2)
#print(s1.div(s2))

#Median of the values.
print(s1)
print(s1.median())

#Sum of the values.
print(s1.sum())

#Maximum value.
print(s1.max())

#Minimum value (printed on the following line).
print(s1.min())```

# Special where method

```import pandas as pd
import numpy as np
import string

#Series.where behaves differently from numpy's where: it keeps the
#original value where the condition holds and replaces the rest.
s1 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
#Values > 3 are kept; everything else becomes the missing value (NaN).
print(s1.where(s1 > 3))

#With a second argument, values failing the condition become 10
#instead of NaN; values > 3 are still kept unchanged.
print(s1.where(s1 > 3, 10))

#(The replacement applies to the elements NOT satisfying s1 > 3.)
"""```

# Create dataframe data type

```Series Only row indexes, and DataFrame Objects have both row and column indexes
//Row index, indicating different rows, horizontal index, called index,
//Column index, indicating different columns, vertical index, called columns,

"""

import pandas as pd
import numpy as np

# Method 1: build a DataFrame from a nested list.
li = [
    [1, 2, 3, 4],
    [2, 3, 4, 5]
]

# A DataFrame has two indexes: the row index (axis=0, `index`) and the
# column index (axis=1, `columns`).
d1 = pd.DataFrame(data=li, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d1)

# Method 2: build from a numpy ndarray.
# [0 1 2 3 4 5 6 7] -> [[0 1 2 3], [4 5 6 7]]
narr = np.arange(8).reshape(2, 4)
d2 = pd.DataFrame(data=narr, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d2)

# Method 3: build from a dict; keys become column names.
# (renamed from `dict` so the builtin is not shadowed)
counts = {
    'views': [1, 2, ],
    'loves': [2, 3, ],

}
d3 = pd.DataFrame(data=counts, index=['Vermicelli', "Fans"])
print(d3)

# Date helpers: pd.date_range builds a DatetimeIndex.
dates = pd.date_range(start='1/1/2019', end='1/08/2019')
print(dates)

# periods=6 with freq='2D' produces 6 dates, two days apart, from today.
dates = pd.date_range(start='today', periods=6, freq='2D' )
print(dates)

# Use the dates as the row index of a DataFrame of random values.
columns = ['A', 'B', 'C', 'D']
d4 = pd.DataFrame( np.random.randn(6, 4), index=dates, columns=columns)
print(d4)

# One-dimensional case: a Series indexed by every day in a date range.
dates = pd.date_range(start='1/1/2021', end='1/3/2021', freq='D')
print(dates)
s1 = pd.Series([1, 2, 3], index=dates)
print(s1)```

# dataframe Basic Properties and Overall Situation Query

```"""
a)Basic Properties
df.shape  #Number of rows, columns
df.dtype #Column data type
df.ndim #Data Dimension
df.index #Row index
df.columns #Row Index
df.values #Object value, two-dimensional ndarray array

b)Overall situation query
df.tail(3) #Show the last few lines, default 5 lines
df.info() #Overview of related information: number of rows, number of columns, index, number of non-null columns, column type, memory usage
df.describe() #Quick comprehensive statistical results: count, mean, standard deviation, maximum, quartile, minimum, etc.

"""

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

narr = np.arange(8).reshape(2, 4)
#A DataFrame has a row index (axis=0) and a column index (axis=1).
d2 = pd.DataFrame(data=narr, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d2)

#****************************** 1). Basic attributes ***************
print(d2.shape)  # (rows, columns)
print(d2.dtypes)  # dtype of every column
print(d2.ndim)  # number of dimensions (2)
print(d2.index)  # row index
print(d2.columns)  # column index
print(d2.values, type(d2.values))  # the data as a 2-D ndarray

#****************************** 2). Overall status queries *********
print(d2.tail(1))  # last n rows (default 5)

print("*" * 10)
#info(): row/column counts, column dtypes, memory usage.
print("info:", d2.info())

print("Statistics".center(50, '*'))
#describe(): count, mean, std, min, quartiles, max per numeric column.
print(d2.describe())

#3). Transposition (all three commented/active forms are equivalent).
print("d2: \n", d2)
#print("d2 T: \n", d2.transpose())
print("d2 T: \n", d2.T)
#print("d2 T: \n", d2.swapaxes(1, 0))

#4). Sorting by columns.
print(d2)
#Sort by the given columns; ascending by default, ascending=False for descending.
print(d2.sort_values(by=["views", 'tranfers'], ascending=False))

#5). Slicing and label queries.
print(d2)
print(d2[:2])  # row slicing works; plain integer indexing does not
print('1:\n', d2['views'])  # single column by label
print('2:\n', d2.views)  # attribute access, equivalent to the above
print(d2[['views', 'comments']])  # several columns by label list

#6). Position- and label-based row access:
#       - iloc: select rows by integer position
#       - loc: select rows by index label
# print(d2[0])
# print(d2)
print(d2.iloc[0])
print(d2.iloc[-1])

print(d2)
print(d2.loc['A'])

# 7). Assigning through loc modifies the DataFrame in place.
d2.loc['A'] = np.nan
print(d2)

print(d2.info())```

```import os

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#csv, excel, json........
#). Writing csv files

df = pd.DataFrame(
{'province': ['Shaanxi', 'Shaanxi', 'Sichuan', 'Sichuan', 'Shaanxi'],
'city': ['Xianyang', 'Baoji', 'Chengdu', 'Chengdu', 'Baoji'],
'count1': [1, 2, 3, 4, 5],
'count2': [1, 2, 33, 4, 5]
}
)

print(df)

filename = os.path.join('doc', 'csvFile.csv')
"""
index=True/False   Whether to store row indexes, Normally not stored
mode='w'           How files are written, Default is'w'(Empty the original file contents, Rewrite), 'a'Append
"""
#df.to_csv(filename, index=False, mode='a', header=False, sep=' ')  # index=False does not store row indexes
#print("csv file saved successfully")

#print(df2)

#3). Writing Excel files
df.to_excel("doc\excelFile.xlsx", sheet_name="Provincial Statistics", index=False)
print("excel File saved successfully")```

# groupby for grouping and aggregation operations

```"""
pandas Provides a flexible and efficient groupby Functions,
1). It allows you to slice, slice, summarize, and so on, in a natural way.
2). Depending on one or more keys (can be functions, arrays, or DataFrame column>Name) Split pandas Object.
3). Calculate grouping summary statistics, such as counts, averages, standard deviations, or user-defined functions.

"""

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

df = pd.DataFrame(
{'province': ['Shaanxi', 'Shaanxi', 'Sichuan', 'Sichuan', 'Shaanxi'],
'city': ['Xianyang', 'Baoji', 'Chengdu', 'Chengdu', 'Baoji'],
'count1': [1, 2, 3, 4, 5],
'count2': [1, 2, 33, 4, 5]
}
)
# Xianyang 1, Shaanxi
#          Baoji 1

print(df)
# Statistical analysis was performed based on the key value of a column.
grouped = df['count1'].groupby(df['province'])
print(grouped.describe())
print(grouped.median())

# Statistical analysis of cpunt1 information based on cities;
grouped = df['count1'].groupby(df['city'])
print(grouped.max())

# Specify multiple key values for categorical aggregation;
grouped = df['count1'].groupby([df['province'], df['city']])
print(grouped.max())
print(grouped.sum())
print(grouped.count())

#  Hierarchical index is achieved by unstack method.
print(grouped.max().unstack())```

# Case study: commodity data analysis

```"""
//Document description: Each column of data represents the following: order number, order quantity, commodity name, commodity detail selection, total commodity price
//Requirement 1:
1). Read all data from a file;  How to read csv file? to_csv
2). Get all the product names in the data; how to get them dataframe A column of information in an object? df['Column Name'], df.Column Name
3). Sort by commodity price, descend order, how to df object order? d2.sort_values(by=["Sorted Column Name"], ascending=True)
//Write the most expensive 20 pieces of product information into the mosthighPrice.xlsx file; how do I get the first 20 lines of DF and write them to the file? Df.head(20) df1.to_csv(xxxxxx)

//Requirement 2:
1). Statistical Columns[item_name]Draw a column chart of the frequency of each item in the
(Rank of the most purchased goods-Draw the first 5 records)
2). By Column [odrder_id] Group together to find the total amount spent on each order.
3). Draw a scatterplot of the total amount of each order and the total quantity of its goods.
"""

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

##Requirement 1:
##1). Read all the data from the file;
##2) Get all the names of goods in the data;
##print(goodsInfo.tail())
##print(goodsInfo.info())
##print(goodsInfo.describe())

##Requirement 1:

#3. Sort in descending order according to the price of the goods.
##Write the information of the 20 highest-priced products into the mosthighPrice.xlsx file;
##Re-assignment;
#goodsInfo.item_price = goodsInfo.item_price.str.strip('\$').astype(np.float)
#filename = 'doc\mostHighPrice.xlsx'
#highPriceData.to_excel(filename)
#print("saved successfully......")

#Requirement 2:
#1). Statistics column [item_name] for the frequency of each commodity, drawing a column chart
#(Top 5 purchases - Top 5 records)
#new_info counts the number of occurrences of each item name; where Unnamed: 0 is the frequency of occurrences of the item we need to obtain;
newInfo = goodsInfo.groupby('item_name').count()
mostRaiseGoods = newInfo.sort_values('Unnamed: 0', ascending=False)['Unnamed: 0'].head(5)
print(mostRaiseGoods)       # Series object

#Gets the name of the commodity in the object;
x = mostRaiseGoods.index
#Number of occurrences of acquisitions;
y = mostRaiseGoods.values

#from pyecharts import Bar
#Bar = Bar (ranking of the most purchased items)
#bar.render()
#Requirement 2:
#2). Grouped by column [odrder_id], find out the total amount spent per order=======quantity, item_price.
#3). Draw a scatter chart based on the total amount of each order and the total quantity of its goods.

#Get Order Quantity
quantity = goodsInfo.quantity
#Get the order item_price
item_price = goodsInfo.item_price.str.strip('\$').astype(np.float)
print(item_price)

#Grouped by column [odrder_id]
order_group = goodsInfo.groupby("order_id")
#Total amount per order
x = order_group.item_price.sum()
#Total quantity of goods
y = order_group.quantity.sum()```

# Common String Operations

```import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

series1= pd.Series(['\$A:1', '\$B:2', '\$C:3', np.nan, '\$cat:3'])
print(series1)

#Convert all letters to lowercase except missing values
print(series1.str.lower())

#Convert all letters to uppercase except missing values
print(series1.str.upper())

#separate
print(series1.str.split(":"))

#Remove a character from the left and right ends
print(series1.str.strip('\$'))
"""```

Columns of the document: total bill amount, tip amount, gender, smoker (yes/no), date, time, day of week.
Requirements:

• Draw a scatter chart of bill amount versus tip, for smokers and for non-smokers;
• Draw a scatter chart of bill amount versus tip for smoking and non-smoking customers, separately for women and for men.

# Association between consumption and tips and gender and smoking

```import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

filename = 'doc/tips.csv'
#print(data.info())

# # Scatter Chart between Smoking Customer Consumption Amount and Tip
# smoker = data[data['smoker'] == 'Yes']          # Screening customer information for smoking
# x_total_bill1 = smoker['total_bill']
# y_tip1 = smoker['tip']

##Scatter chart between consumption amount and tip for non-smoking customers
#no_smoker = data[data['smoker'] != 'Yes']
#x_total_bill2 = no_smoker['total_bill']
#y_tip2 = no_smoker['tip']

#from pyecharts import  Scatter
#Scatter = Scatter
#Add the first data information: x and y
##Add the second data information: x and y
#scatter.render()

##Scatter plot relationship between consumption amount and tips of smoking and non-smoking customers among women;
is_smoker = data['smoker'] == 'Yes'
is_female = data['sex'] == 'Female'

female_smoker = data[is_female & is_smoker]
female_no_smoker = data[is_female & ~is_smoker]
male_smoker = data[~is_female & is_smoker]
male_no_smoker = data[~is_female & ~is_smoker]

#3. Scatter plotting
from pyecharts import  Scatter
scatter = Scatter("Scatter chart between consumption amount and tip")