欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Python pandas 库实战:完成一项数据处理工作

程序员文章站 2022-07-12 21:37:24
...

下面分步骤完成一个数据处理目标:把满足要求的数据筛选出来,再做进一步处理。

这项工作在 Excel 中完全可以完成,但为了熟悉 pandas 中数据框(DataFrame)的用法,这里花点时间用代码试验一下。
图片的格式在下方:

主函数:

main.py

import setDF2
import re
import numpy as np
import pandas as pd 
#在data1中找出我们需要的词,并输出它们对应的参数,供下次分析使用

def fuzzyfinder(user_input, collection):
    """Return the items of *collection* that fuzzily match *user_input*.

    An item matches when the characters of *user_input* occur in it in the
    same order, with any text allowed in between (``'djm'`` matches
    ``'django_migrations'``).

    Args:
        user_input: The query string; every character is matched literally.
        collection: Iterable of candidate strings.

    Returns:
        The matching items, ordered by shortest matched span, then earliest
        match position, then the item itself.
    """
    # Escape each character so regex metacharacters in the query ('.', '+',
    # '(' ...) are matched literally instead of raising re.error or acting
    # as wildcards.  'djm' still becomes 'd.*?j.*?m'.
    pattern = '.*?'.join(re.escape(char) for char in user_input)
    regex = re.compile(pattern)

    suggestions = []
    for item in collection:
        match = regex.search(item)
        if match:
            suggestions.append((len(match.group()), match.start(), item))
    return [item for _, _, item in sorted(suggestions)]

def remove_n(l):
    """Cut every string in *l* at its first newline, in place.

    The list is modified and the same list object is returned.
    """
    for position, text in enumerate(l):
        head, _, _ = text.partition('\n')
        l[position] = head
    return l

def add_all(c, s):
    """Add every element of the iterable *c* to the set *s* and return *s*."""
    s.update(c)
    return s

def returnAllword(als, word_file='C:\\Users\\Administrator\\Desktop\\word.txt'):
    """Return the entries of *als* that fuzzily match any keyword in a file.

    Args:
        als: Iterable of candidate strings (the search terms).
        word_file: Path of a text file holding one keyword per line.
            Defaults to the location the script has always used.

    Returns:
        A list of the distinct entries of *als* matched by at least one
        keyword.  The order is unspecified because the matches are
        de-duplicated through a set.
    """
    # Read-only access is enough, and the context manager closes the handle.
    with open(word_file, 'r') as handle:
        keywords = [line.split('\n')[0] for line in handle]

    matched = set()
    for keyword in keywords:
        # A blank line would produce an empty pattern that matches every
        # candidate, so it is skipped.
        if not keyword:
            continue
        matched.update(fuzzyfinder(keyword, als))
    return list(matched)

def exchange2(l, word_file='C:\\Users\\Administrator\\Desktop\\word.txt'):
    """Keep only the strings made entirely of characters used by the keywords.

    Args:
        l: Iterable of strings to filter.
        word_file: Path of a text file holding one keyword per line.
            Defaults to the location the script has always used.

    Returns:
        A list, in the original order, of the strings whose characters all
        occur in some keyword of *word_file*.  The space character is always
        allowed.
    """
    # Read-only access is enough, and the context manager closes the handle.
    with open(word_file, 'r') as handle:
        keywords = [line.split('\n')[0] for line in handle]

    allowed_chars = {' '}
    for keyword in keywords:
        allowed_chars.update(keyword)

    return [text for text in l if set(text) <= allowed_chars]

def returnListIndex(bl):
    """Return the row positions in ``data1`` whose search term is in *bl*.

    Reads the module-level DataFrame ``data1`` and its ``搜索词`` column.

    Args:
        bl: Iterable of search terms to look for.

    Returns:
        A list of row positions, each converted to ``str``, in ascending
        order.
    """
    wanted = set(bl)  # built once so each row costs a single hash lookup
    # Iterating the column walks the rows by position.  Indexing the Series
    # with an integer key would be a label lookup on a non-integer index.
    return [
        str(position)
        for position, term in enumerate(data1.搜索词)
        if term in wanted
    ]

# Step 1: from the keyword table, pick the search terms that are related to
# the entries in the keyword file.
file = 'F:\\By\\August\\160816\\热搜探究\\0816_ws1.csv'
data1 = setDF2.setDF2(file)

bl = exchange2(returnAllword(data1.搜索词))
list_index = returnListIndex(bl)

# One-column table of the matched terms, labelled by the terms themselves.
da = np.array(bl).reshape(len(bl), 1)
# The index is given as the flat list: pandas requires index data to be 1-D,
# so the (n, 1) array cannot be reused for it.
df = pd.DataFrame(da, index=bl, columns=['条件词'])

# Rows of data1 selected by the labels in list_index.
data2 = pd.DataFrame(data1, index=list_index)

# Step 2: among the keyword matches in data2, keep the rows with
# 搜索人气 > 200, 商城点击占比 > 0.40 and 直通车参考价 < 2.57.
# The thresholds are applied to data2 itself, so every matched row is
# examined, including the first one.
popularity = data2['搜索人气'].astype(float)
mall_click_share = data2['商城点击占比'].astype(float)
reference_price = data2['直通车参考价'].astype(float)
selected = (
    (popularity > 200)
    & (mall_click_share > 0.40)
    & (reference_price < 2.57)
)
ddv = data2[selected]
re_index = list(ddv.index)  # labels of the rows that passed the filter
print(ddv)  # show every row that satisfies the conditions

# Step 3: write the result to disk.
ddv.to_csv('C:\\Users\\Administrator\\Desktop\\result_word.csv')

辅助函数setDF2.py

#功能类似于pandas.read_csv:跳过文件第一行,以第二行作为表头、每行第一列作为行索引,所有取值都保留为字符串
import pandas as pd  
import numpy as np  
  
def strToD(x):
    """Return the part of *x* that precedes its first newline."""
    head, _, _ = x.partition('\n')
    return head
   
def setDF2(file):
    """Load a comma-separated text file into a DataFrame of strings.

    Layout expected by the parser:

    * the first line of the file is ignored;
    * the second line is the header, and its first field is discarded;
    * every later line is a data row whose first field becomes the row
      label and whose remaining fields become the values.

    Fields are split on every comma; quoted fields are not supported.

    Args:
        file: Path of the file to read.

    Returns:
        A DataFrame with one row per data line.  All values are kept as
        strings.

    Raises:
        ValueError: If the data rows do not all have as many fields as the
            header.
    """
    # Read-only access is enough, and the context manager closes the handle.
    with open(file, 'r') as handle:
        strings = handle.readlines()

    names = []
    data = []
    columes = []
    # The header is recognised by its position, so a data line that repeats
    # the header text is still treated as data.
    for position, line in enumerate(strings[1:]):
        fields = line.rstrip('\n').split(',')
        if position == 0:
            columes = fields[1:]
        else:
            names.append(fields[0])
            data.extend(fields[1:])

    dd = np.array(data).reshape(len(names), len(columes))
    return pd.DataFrame(dd, names, columes)
PS:桌面上的那个 TXT 文档(word.txt)里存放的,就是根据特征挑选出来的关键词。