-- Count the rows of `log` per hour of day.
-- DATE_FORMAT(access_time, '%H') renders the hour as a two-digit string
-- (00..23), so every row falling in the same hour lands in the same group.
SELECT DATE_FORMAT(access_time, '%H'), count(*)
FROM log
GROUP BY DATE_FORMAT(access_time, '%H');
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""pd_ng_log_stat.py -- per-hour page-view statistics built with pandas.

Each line of the given log files is handed to ``NgLineParser``; the parsed
records are collected into a ``DataFrame`` and counted per hour of day.
"""

from ng_line_parser import NgLineParser

import pandas as pd
import socket
import struct


class PDNgLogStat(object):
    """Load parsed log records into a DataFrame and compute statistics."""

    def __init__(self):
        # A single parser instance is reused for every line.
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Yield one parsed record per line of every file in ``pathes``.

        :param pathes: iterable of log file paths, read in order.
        :return: generator yielding ``self.ng_line_parser.to_dict()`` after
            each line has been passed to ``parse``.
        """
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the log files listed in ``path``.

        :param path: iterable of log file paths (despite the singular name,
            it is forwarded unchanged to ``_log_line_iter``).
        """
        self.df = pd.DataFrame(self._log_line_iter(path))

    def pv_hour(self):
        """Count the number of accesses for each hour of the day.

        :return: DataFrame indexed by the hour key, holding the ``count``
            aggregate of the ``access_time`` column.
        """
        # Columns to group on; only this column is counted and displayed.
        group_by_cols = ['access_time']

        # Group by the hour part (hh) of access_time. The key is the text
        # before the first ':' of the last whitespace-separated token.
        # NOTE(review): assumes access_time is a string that ends with
        # "HH:MM:SS" -- confirm against NgLineParser.to_dict().
        hour_key = self.df['access_time'].map(
            lambda x: x.split().pop().split(':')[0])
        pv_hour_grp = self.df[group_by_cols].groupby(hour_key)

        return pv_hour_grp.agg(['count'])


def main():
    """Load the sample access log and print the page views per hour."""
    file_pathes = ['www.trustauth.cn.access.log']

    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Page views per hour. The parenthesised form prints the same text on
    # Python 2 and Python 3.
    print(pd_ng_log_stat.pv_hour())


if __name__ == '__main__':
    main()
运行统计和输出结果
python pd_ng_log_stat.py
            access_time
                  count
access_time
00                31539
01                34824
02                27895
03                29669
04                27742
05                26797
06                29384
07                31102
08                38257
09                43060
10                48064
11                57923
12                56413
13                57971
14                47260
15                46364
16                45721
17                48884
18                49318
19                49162
20                43641
21                42525
22                40371
23                34953
文章转载来自:trustauth.cn