1010import uuid
1111import logging
1212import sys
13+ import random
14+ import urllib .parse
1315
14- if len (sys .argv ) == 2 and sys .argv [1 ].startswith ("--log=" ):
15- loglevel = sys .argv [1 ][6 :]
16- else :
17- loglevel = "WARNING"
16+ loglevel = "WARNING"
17+ sample = False
18+ for arg in sys .argv :
19+ if arg .startswith ("--sample=" ):
20+ sample = int (arg [9 :])
21+ if arg .startswith ("--log=" ):
22+ loglevel = arg [6 :]
1823
1924numeric_level = getattr (logging , loglevel .upper (), None )
2025if not isinstance (numeric_level , int ):
# Path to the choctaw hog binary; a missing env var fails fast with KeyError.
CHOCTAW_HOG_PATH = os.environ["CHOCTAW_HOG_PATH"]

# Authenticate to GitHub Enterprise and grab the (paginated) repo listing.
logging.info("Trying to authenticate to Github...")
_api_base = f"https://{GHE_DOMAIN}/api/v3"
g = Github(base_url=_api_base, login_or_token=GHE_REPO_TOKEN, per_page=100)
repos = g.get_repos()
if sample:
    # NOTE(review): list(repos) materializes every page of the listing before
    # sampling — confirm this is acceptable for large GHE instances.
    logging.info(f"sample size set to {sample}, retrieving list of repos...")
    repos = random.sample(list(repos), sample)
3544
# use the datetime library to get an object representing 24 hours ago
# (comment previously said 48 hours; the code subtracts timedelta(hours=24))
# NOTE(review): datetime.today() is timezone-naive — confirm downstream
# comparisons against GitHub commit timestamps tolerate that.
today = datetime.today()
twentyfourhoursago = today - timedelta(hours=24)

# start the first main set of work: translate our list of repo objects to a
# dict of { git_ssh_url : (since_commit_hash, html_url) }
repo_dict = {}
logging.info("Getting a list of all commits since 24 hours ago for each repo...")
4352for repo in repos :
4453 commits = []
4554 try :
5463 logging .debug ("no SSH URL" )
5564 continue
5665 logging .info (f"({ repo .ssh_url } , { commits [- 1 ].sha } " )
57- repo_dict [repo .ssh_url ] = (commits [- 1 ].sha , f" { repo .html_url } /commit/" )
66+ repo_dict [repo .ssh_url ] = (commits [- 1 ].sha , repo .html_url )
5867
logging.info("Completed Github API requests...")
# Drop entries whose value is falsy (None) — repos we could not resolve a
# since-commit for.
repo_dict = {ssh_url: info for ssh_url, info in repo_dict.items() if info}
# Each choctaw hog run writes its JSON results into the system temp
# directory; downstream we map git url -> results filename.
tempdir = tempfile.gettempdir()

logging.info("Starting choctaw hog scan of all commits over the last 24 hours...")
def scan_repo(x):
    """Run choctaw hog against one repo and return where its results went.

    x is a (ssh_url, (since_commit_sha, html_url)) pair taken from
    repo_dict.items(). Returns {"repo": ssh_url, "results": results_file,
    "url": html_url}; the results file is JSON written by choctaw hog.
    """
    # unique output file per scan so parallel workers never collide
    filename = os.path.join(tempdir, str(uuid.uuid4()))
    cmdline = [
        CHOCTAW_HOG_PATH,
        "--outputfile",
        filename,
        "--since_commit",
        x[1][0],
        "--sshkeypath",
        SSH_KEY_PATH,
        x[0],
    ]
    # (str() inside the f-string was redundant — formatting already calls it)
    logging.info(f"Running choctaw hog: {cmdline}")
    s = subprocess.run(cmdline, capture_output=True)
    logging.info(f"choctaw hog output: {s.stdout} {s.stderr}")
    if s.returncode != 0:
        # a failed scan may leave no results file; the downstream reader
        # tolerates a missing file, but surface the failure in the log
        logging.warning(f"choctaw hog exited {s.returncode} for {x[0]}")
    return {"repo": x[0], "results": filename, "url": x[1][1]}
8696
87-
8897output = []
8998
9099# increase this number to the number of cores you have - runs great on a c5n.4xlarge with 14
@@ -95,27 +104,42 @@ def scan_repo(x):
95104logging .debug (output )
96105
# the last block of work: iterate through each JSON file from choctaw_hog and
# collect the findings into events for Insights
logging.info("Collecting choctaw hog output into a single python list...")
output_array = []
for result_dict in output:
    try:
        f = open(result_dict["results"], "r")
    except OSError:
        # the results file won't exist if we couldn't access the git repo;
        # narrowed from a bare except: so unrelated bugs still surface
        logging.warning("failed to open " + result_dict["results"])
        continue

    with f:
        result_list = json.load(f)
        for finding in result_list:
            # link to the exact line where the secret appears: the commit that
            # added it when new_line_num is set, else the parent commit (the
            # secret was on the removed side of the diff)
            if finding["new_line_num"] != 0:
                fileurl = f"{result_dict['url']}/blob/{finding['commitHash']}/{finding['path']}#L{finding['new_line_num']}"
            else:
                fileurl = f"{result_dict['url']}/blob/{finding['parent_commit_hash']}/{finding['path']}#L{finding['old_line_num']}"
            output_array.append(
                {
                    "eventType": "ghe_secret_monitor",
                    "commitHash": finding["commitHash"],
                    "reason": finding["reason"],
                    "path": finding["path"],
                    "repo": result_dict["repo"],
                    "url": f"{result_dict['url']}/commit/{finding['commitHash']}/{finding['path']}",
                    "fileurl": fileurl,
                    "old_line_num": finding["old_line_num"],
                    "new_line_num": finding["new_line_num"],
                    "parent_commitHash": finding["parent_commit_hash"],
                }
            )

    # results have been consumed; don't leave per-scan files in the temp dir
    os.remove(result_dict["results"])

# BUG FIX: this literal was missing its f-prefix, so the text
# "{INSIGHTS_ACCT_ID}" was posted verbatim instead of the account id.
url = f"https://insights-collector.newrelic.com/v1/accounts/{INSIGHTS_ACCT_ID}/events"
120144headers = {
121145 "Content-Type" : "application/json" ,
@@ -125,6 +149,6 @@ def scan_repo(x):
# gzip the JSON event payload and ship it to New Relic Insights
payload = json.dumps(output_array).encode("utf-8")
post = gzip.compress(payload)
logging.info(f"len(output_array) = {len(output_array)}")
logging.debug(output_array)
logging.info("Submitting data to New Relic Insights...")
r = requests.post(url, data=post, headers=headers)
logging.info(f"insights status code: {r.status_code}")
130-