19 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
19 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
20 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
20 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
21 |
21 |
22 """Removes commented-out Python code.""" |
22 """Removes commented-out Python code.""" |
23 |
23 |
|
24 from __future__ import print_function |
|
25 from __future__ import unicode_literals |
|
26 |
24 import difflib |
27 import difflib |
25 import io |
28 import io |
26 import os |
29 import os |
27 import re |
30 import re |
28 import tokenize |
31 import tokenize |
29 |
32 |
30 __version__ = '1.0' |
33 __version__ = '2.0.0' |
31 |
34 |
32 |
35 |
class Eradicator(object):
    """Eradicate commented-out code.

    All heuristics are class-level compiled regexes / constant tables so
    that ``update_whitelist`` can swap the whitelist per instance while the
    defaults stay shared.
    """

    BRACKET_REGEX = re.compile(r'^[()\[\]{}\s]+$')
    CODING_COMMENT_REGEX = re.compile(r'.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)')
    DEF_STATEMENT_REGEX = re.compile(r"def .+\)[\s]+->[\s]+[a-zA-Z_][a-zA-Z0-9_]*:$")
    FOR_STATEMENT_REGEX = re.compile(r"for [a-zA-Z_][a-zA-Z0-9_]* in .+:$")
    HASH_NUMBER = re.compile(r'#[0-9]')
    MULTILINE_ASSIGNMENT_REGEX = re.compile(r'^\s*\w+\s*=.*[(\[{]$')
    PARTIAL_DICTIONARY_REGEX = re.compile(r'^\s*[\'"]\w+[\'"]\s*:.+[,{]\s*$')
    PRINT_RETURN_REGEX = re.compile(r'^(print|return)\b\s*')
    WITH_STATEMENT_REGEX = re.compile(r"with .+ as [a-zA-Z_][a-zA-Z0-9_]*:$")

    # A comment must contain at least one of these tokens before the more
    # expensive checks (and a ``compile()`` attempt) are tried at all.
    CODE_INDICATORS = ['(', ')', '[', ']', '{', '}', ':', '=', '%',
                       'print', 'return', 'break', 'continue', 'import']
    CODE_KEYWORDS = [r'elif\s+.*', 'else', 'try', 'finally', r'except\s+.*']
    CODE_KEYWORDS_AGGR = CODE_KEYWORDS + [r'if\s+.*']
    WHITESPACE_HASH = ' \t\v\n#'

    # Comment prefixes/markers that must never be treated as code
    # (linter pragmas, TODO-style notes, bullet-list markers).
    DEFAULT_WHITELIST = (
        r'pylint',
        r'pyright',
        r'noqa',
        r'type:\s*ignore',
        r'fmt:\s*(on|off)',
        r'TODO',
        r'FIXME',
        r'XXX',
        r'~ ',
        r'- ',
    )
    WHITELIST_REGEX = re.compile(r'|'.join(DEFAULT_WHITELIST), flags=re.IGNORECASE)

    def comment_contains_code(self, line, aggressive=True):
        """Return True if the comment line appears to contain code.

        ``aggressive`` additionally treats bare ``if ...:`` headers and
        multiline fragments as code, at the cost of more false positives.
        """
        line = line.lstrip()
        if not line.startswith('#'):
            return False

        line = line.lstrip(self.WHITESPACE_HASH).strip()

        # Ignore non-comment related hashes. For example, "# Issue #999".
        if self.HASH_NUMBER.search(line):
            return False

        # Ignore whitelisted comments
        if self.WHITELIST_REGEX.search(line):
            return False

        # An encoding declaration ("# -*- coding: utf-8 -*-") is not code.
        if self.CODING_COMMENT_REGEX.match(line):
            return False

        # Check that this is possibly code.
        for symbol in self.CODE_INDICATORS:
            if symbol in line:
                break
        else:
            return False

        if self.multiline_case(line, aggressive=aggressive):
            return True

        for symbol in self.CODE_KEYWORDS_AGGR if aggressive else self.CODE_KEYWORDS:
            if re.match(r'^\s*' + symbol + r'\s*:\s*$', line):
                return True

        # Strip a leading "print"/"return" so the remainder can be compiled
        # as a standalone expression/statement.
        line = self.PRINT_RETURN_REGEX.sub('', line)

        if self.PARTIAL_DICTIONARY_REGEX.match(line):
            return True

        try:
            compile(line, '<string>', 'exec')
        except (SyntaxError, TypeError, UnicodeDecodeError):
            return False
        else:
            return True

    def multiline_case(self, line, aggressive=True):
        """Return True if line is probably part of some multiline code."""
        if aggressive:
            for ending in ')]}':
                if line.endswith(ending + ':'):
                    return True

                if line.strip() == ending + ',':
                    return True

            # Check whether a function/method definition with return value
            # annotation
            if self.DEF_STATEMENT_REGEX.search(line):
                return True

            # Check whether a with statement
            if self.WITH_STATEMENT_REGEX.search(line):
                return True

            # Check whether a for statement
            if self.FOR_STATEMENT_REGEX.search(line):
                return True

        if line.endswith('\\'):
            return True

        if self.MULTILINE_ASSIGNMENT_REGEX.match(line):
            return True

        if self.BRACKET_REGEX.match(line):
            return True

        return False

    def commented_out_code_line_numbers(self, source, aggressive=True):
        """Yield line numbers of commented-out code."""
        sio = io.StringIO(source)
        try:
            for token in tokenize.generate_tokens(sio.readline):
                token_type = token[0]
                start_row = token[2][0]
                line = token[4]

                if (token_type == tokenize.COMMENT and
                        line.lstrip().startswith('#') and
                        self.comment_contains_code(line, aggressive)):
                    yield start_row
        except (tokenize.TokenError, IndentationError):
            # Unparseable source: report nothing rather than crash.
            pass

    def filter_commented_out_code(self, source, aggressive=True):
        """Yield code with commented out code removed."""
        marked_lines = list(self.commented_out_code_line_numbers(source,
                                                                 aggressive))
        sio = io.StringIO(source)
        previous_line = ''
        for line_number, line in enumerate(sio.readlines(), start=1):
            # Keep a marked line if the previous one ends with a backslash:
            # it is a continuation of a real (kept) statement.
            if (line_number not in marked_lines or
                    previous_line.rstrip().endswith('\\')):
                yield line
            previous_line = line

    def fix_file(self, filename, args, standard_out):
        """Run filter_commented_out_code() on file.

        Writes in place when ``args.in_place`` is set, otherwise emits a
        unified diff to ``standard_out``.  Returns True if changes were
        found (used for ``--error`` exit status).
        """
        encoding = self.detect_encoding(filename)
        with self.open_with_encoding(filename, encoding=encoding) as input_file:
            source = input_file.read()

        filtered_source = ''.join(self.filter_commented_out_code(source,
                                                                 args.aggressive))

        if source != filtered_source:
            if args.in_place:
                with self.open_with_encoding(filename, mode='w',
                                             encoding=encoding) as output_file:
                    output_file.write(filtered_source)
            else:
                diff = difflib.unified_diff(
                    source.splitlines(),
                    filtered_source.splitlines(),
                    'before/' + filename,
                    'after/' + filename,
                    lineterm='')
                standard_out.write('\n'.join(list(diff) + ['']))
            return True

    def open_with_encoding(self, filename, encoding, mode='r'):
        """Return opened file with a specific encoding."""
        return io.open(filename, mode=mode, encoding=encoding,
                       newline='')  # Preserve line endings

    def detect_encoding(self, filename):
        """Return file encoding, falling back to latin-1 on any failure."""
        try:
            with open(filename, 'rb') as input_file:
                from lib2to3.pgen2 import tokenize as lib2to3_tokenize
                encoding = lib2to3_tokenize.detect_encoding(input_file.readline)[0]

            # Check for correctness of encoding.
            with self.open_with_encoding(filename, encoding) as input_file:
                input_file.read()

            return encoding
        except (SyntaxError, LookupError, UnicodeDecodeError):
            return 'latin-1'

    def update_whitelist(self, new_whitelist, extend_default=True):
        """Update the whitelist regex.

        ``new_whitelist`` is a list of regex fragments; when
        ``extend_default`` is True they are appended to DEFAULT_WHITELIST,
        otherwise they replace it entirely (instance-level override).
        """
        if extend_default:
            self.WHITELIST_REGEX = re.compile(
                r'|'.join(list(self.DEFAULT_WHITELIST) + new_whitelist),
                flags=re.IGNORECASE)
        else:
            self.WHITELIST_REGEX = re.compile(
                r'|'.join(new_whitelist),
                flags=re.IGNORECASE)
|
198 |
235 |
199 |
236 |
200 def main(argv, standard_out, standard_error): |
237 def main(argv, standard_out, standard_error): |
201 """Main entry point.""" |
238 """Main entry point.""" |
202 import argparse |
239 import argparse |
206 parser.add_argument('-r', '--recursive', action='store_true', |
243 parser.add_argument('-r', '--recursive', action='store_true', |
207 help='drill down directories recursively') |
244 help='drill down directories recursively') |
208 parser.add_argument('-a', '--aggressive', action='store_true', |
245 parser.add_argument('-a', '--aggressive', action='store_true', |
209 help='make more aggressive changes; ' |
246 help='make more aggressive changes; ' |
210 'this may result in false positives') |
247 'this may result in false positives') |
|
248 parser.add_argument('-e', '--error', action="store_true", |
|
249 help="Exit code based on result of check") |
211 parser.add_argument('--version', action='version', |
250 parser.add_argument('--version', action='version', |
212 version='%(prog)s ' + __version__) |
251 version='%(prog)s ' + __version__) |
|
252 parser.add_argument('--whitelist', action="store", |
|
253 help=( |
|
254 'String of "#" separated comment beginnings to whitelist. ' |
|
255 'Single parts are interpreted as regex. ' |
|
256 'OVERWRITING the default whitelist: {}' |
|
257 ).format(Eradicator.DEFAULT_WHITELIST)) |
|
258 parser.add_argument('--whitelist-extend', action="store", |
|
259 help=( |
|
260 'String of "#" separated comment beginnings to whitelist ' |
|
261 'Single parts are interpreted as regex. ' |
|
262 'Overwrites --whitelist. ' |
|
263 'EXTENDING the default whitelist: {} ' |
|
264 ).format(Eradicator.DEFAULT_WHITELIST)) |
213 parser.add_argument('files', nargs='+', help='files to format') |
265 parser.add_argument('files', nargs='+', help='files to format') |
214 |
266 |
215 args = parser.parse_args(argv[1:]) |
267 args = parser.parse_args(argv[1:]) |
216 |
268 |
|
269 eradicator = Eradicator() |
|
270 |
|
271 if args.whitelist_extend: |
|
272 eradicator.update_whitelist(args.whitelist_extend.split('#'), True) |
|
273 elif args.whitelist: |
|
274 eradicator.update_whitelist(args.whitelist.split('#'), False) |
|
275 |
217 filenames = list(set(args.files)) |
276 filenames = list(set(args.files)) |
|
277 change_or_error = False |
218 while filenames: |
278 while filenames: |
219 name = filenames.pop(0) |
279 name = filenames.pop(0) |
220 if args.recursive and os.path.isdir(name): |
280 if args.recursive and os.path.isdir(name): |
221 for root, directories, children in os.walk('{}'.format(name)): |
281 for root, directories, children in os.walk('{}'.format(name)): |
222 filenames += [os.path.join(root, f) for f in children |
282 filenames += [os.path.join(root, f) for f in children |