C minifier with Clang

I recently revamped Competitive programming in Nim. In short, I can create a C amalgamation from a Nim program and submit the C source code to various competitive programming websites.

Then I use a Clang based tool to shorten the C source code. It does two things:

  • Shorten function, variable, and type names
  • Use the clangFormat library to remove some whitespace

For the first step, the tool uses a derived ASTFrontendAction to traverse the AST twice: once to collect function/variable/type names and once to rename them. Building a clang::CompilerInstance from the command line needs some boilerplate. An alternative is to use clang::tooling::CommonOptionsParser and clang::tooling::ClangTool.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
/*
* MiniASTConsumer collects identifiers in `used` and rename candidates (in the main file) in `d2name`.
* MiniASTConsumer iterates over `d2name` and assigns new names.
* Renamer creates clang::tooling::Replacement instances.
* HandleTranslationUnit calls clang::tooling::applyAllReplacements.
*/

#include <clang/AST/ASTConsumer.h>
#include <clang/AST/Decl.h>
#include <clang/AST/RecursiveASTVisitor.h>
#include <clang/Basic/FileManager.h>
#include <clang/Basic/LangOptions.h>
#include <clang/Basic/SourceManager.h>
#include <clang/Basic/TargetInfo.h>
#include <clang/Driver/Action.h>
#include <clang/Driver/Compilation.h>
#include <clang/Driver/Driver.h>
#include <clang/Driver/Tool.h>
#include <clang/Format/Format.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/FrontendAction.h>
#include <clang/Lex/Lexer.h>
#include <clang/Lex/PreprocessorOptions.h>
#include <clang/Tooling/Core/Replacement.h>
#include <llvm/ADT/CachedHashString.h>
#include <llvm/ADT/DenseSet.h>
#include <llvm/ADT/MapVector.h>
#include <llvm/ADT/STLExtras.h>
#include <llvm/Support/Host.h>
#include <llvm/Support/Path.h>
#include <llvm/Support/raw_ostream.h>

#include <memory>
#include <vector>

#include <assert.h>
#include <err.h>
#include <unistd.h>

using namespace clang;
using namespace llvm;

namespace {
/// Runs the clang driver over `args` (argv-style; args[0] is passed to the
/// driver as the program path) and converts the resulting job into a
/// CompilerInvocation.
///
/// Returns nullptr when the driver cannot build a compilation, when the
/// command line does not expand to exactly one job, when that job was not
/// created by the tool named "clang", or when its arguments cannot be parsed
/// by CompilerInvocation::CreateFromArgs.
std::unique_ptr<CompilerInvocation> buildCompilerInvocation(ArrayRef<const char *> args) {
  // Diagnostics produced while driving and parsing go to an IgnoringDiagConsumer.
  IntrusiveRefCntPtr<DiagnosticsEngine> diags(
      CompilerInstance::createDiagnostics(new DiagnosticOptions, new IgnoringDiagConsumer, true));

  driver::Driver theDriver(args[0], llvm::sys::getDefaultTargetTriple(), *diags, "cminify",
                           llvm::vfs::getRealFileSystem());
  theDriver.setCheckInputsExist(false);

  std::unique_ptr<driver::Compilation> compilation(theDriver.BuildCompilation(args));
  if (!compilation)
    return nullptr;

  // Exactly one job is expected, and it has to be a plain command.
  const driver::JobList &jobList = compilation->getJobs();
  if (jobList.size() != 1)
    return nullptr;
  if (!isa<driver::Command>(*jobList.begin()))
    return nullptr;

  const auto &job = cast<driver::Command>(*jobList.begin());
  if (StringRef(job.getCreator().getName()) != "clang")
    return nullptr;

  auto invocation = std::make_unique<CompilerInvocation>();
  const llvm::opt::ArgStringList &jobArgs = job.getArguments();
  if (!CompilerInvocation::CreateFromArgs(*invocation, jobArgs, *diags))
    return nullptr;

  invocation->getDiagnosticOpts().IgnoreWarnings = true;
  invocation->getFrontendOpts().DisableFree = false;
  return invocation;
}

// Function names that must keep their original spelling; filled from the
// `-f` options in main(), which also appends "main".
SmallVector<StringRef, 0> ignores;
// Rename candidates keyed by canonical declaration. Collector stores a
// placeholder ("_f", "_v" or "_t"); MiniASTConsumer later overwrites it with
// the final name, which Renamer then reads.
MapVector<Decl *, std::string> d2name;
// Identifiers seen by Collector (plus a few reserved names); generated names
// are checked against this set.
DenseSet<CachedHashStringRef> used;
// The rewritten source text, produced by HandleTranslationUnit, updated by
// reformat() and written out by main().
std::string newCode;

/// First AST pass. Records the identifiers it encounters in `used` and
/// registers rename candidates that are written in the main file in `d2name`,
/// tagged with a placeholder: "_f" (function), "_v" (variable) or "_t" (type).
struct Collector : RecursiveASTVisitor<Collector> {
  SourceManager &sm;

  Collector(ASTContext &ctx) : sm(ctx.getSourceManager()) {}

  /// Reserves the function name; defined functions written in the main file
  /// become candidates unless listed in `ignores`.
  bool VisitFunctionDecl(FunctionDecl *fd) {
    if (fd->isOverloadedOperator() || !fd->getIdentifier())
      return true;
    used.insert(CachedHashStringRef(fd->getName()));
    if (!fd->isDefined() || !sm.isWrittenInMainFile(fd->getLocation()))
      return true;
    if (!is_contained(ignores, fd->getName()))
      d2name[fd->getCanonicalDecl()] = "_f";
    // Register the parameters right away (even for ignored functions) so
    // their position in `d2name` follows the function's.
    for (ParmVarDecl *param : fd->parameters())
      VisitVarDecl(param);
    return true;
  }

  /// Reserves the variable name; definitions written in the main file become
  /// candidates.
  bool VisitVarDecl(VarDecl *vd) {
    if (!vd->getIdentifier())
      return true;
    used.insert(CachedHashStringRef(vd->getName()));
    auto kind = vd->isThisDeclarationADefinition();
    if (kind != VarDecl::Definition || !sm.isWrittenInMainFile(vd->getLocation()))
      return true;
    d2name[vd->getCanonicalDecl()] = "_v";
    return true;
  }

  /// Reserves the tag name; tag definitions written in the main file become
  /// candidates.
  bool VisitTagDecl(TagDecl *td) {
    // Unnamed struct/union/enum: there is no name token to rewrite, so it
    // must not become a candidate (same guard as for functions/variables).
    // NOTE(review): for an unnamed tag getLocation() is believed to point at
    // the tag keyword, which Renamer would otherwise overwrite - confirm.
    if (!td->getIdentifier())
      return true;
    used.insert(CachedHashStringRef(td->getName()));
    if (!td->isThisDeclarationADefinition() || !sm.isWrittenInMainFile(td->getLocation()))
      return true;
    d2name[td->getCanonicalDecl()] = "_t";
    return true;
  }

  /// Reserves the typedef name; typedefs written in the main file become
  /// candidates.
  bool VisitTypedefNameDecl(TypedefNameDecl *d) {
    // Typedef names (including those from headers) must be reserved too,
    // otherwise a generated type name could redefine one of them.
    if (d->getIdentifier())
      used.insert(CachedHashStringRef(d->getName()));
    if (d->isTransparentTag() || !sm.isWrittenInMainFile(d->getLocation()))
      return true;
    d2name[d->getCanonicalDecl()] = "_t";
    return true;
  }
};

/// Second AST pass. For every declaration or reference whose canonical
/// declaration has an entry in `d2name`, queues a Replacement that rewrites the
/// name token to the mapped name.
struct Renamer : RecursiveASTVisitor<Renamer> {
  SourceManager &sm;
  tooling::Replacements &reps;

  Renamer(ASTContext &ctx, tooling::Replacements &reps) : sm(ctx.getSourceManager()), reps(reps) {}

  /// Queues the replacement of `csr` by `newText`; a rejected replacement is
  /// treated as fatal via cantFail.
  void replace(CharSourceRange csr, StringRef newText) {
    tooling::Replacement rep(sm, csr, newText);
    cantFail(reps.add(rep));
  }

  /// Rewrites the token at `loc` when `key` has an entry in `d2name`.
  void renameTokenAt(Decl *key, SourceLocation loc) {
    auto found = d2name.find(key);
    if (found == d2name.end())
      return;
    replace(CharSourceRange::getTokenRange(loc), found->second);
  }

  bool VisitFunctionDecl(FunctionDecl *fd) {
    renameTokenAt(fd->getCanonicalDecl(), fd->getLocation());
    return true;
  }

  bool VisitVarDecl(VarDecl *vd) {
    renameTokenAt(vd->getCanonicalDecl(), vd->getLocation());
    return true;
  }

  /// Rewrites uses of renamed functions and variables.
  bool VisitDeclRefExpr(DeclRefExpr *dre) {
    Decl *target = dre->getDecl();
    if (!isa<FunctionDecl>(target) && !isa<VarDecl>(target))
      return true;
    auto found = d2name.find(target->getCanonicalDecl());
    if (found == d2name.end())
      return true;
    const SourceRange whole(dre->getBeginLoc(), dre->getEndLoc());
    replace(CharSourceRange::getTokenRange(whole), found->second);
    return true;
  }

  bool VisitTagDecl(TagDecl *d) {
    // Tags that are named only through a typedef are left untouched here.
    if (!d->getTypedefNameForAnonDecl())
      renameTokenAt(d->getCanonicalDecl(), d->getLocation());
    return true;
  }

  bool VisitTagTypeLoc(TagTypeLoc tl) {
    TagDecl *canon = tl.getDecl()->getCanonicalDecl();
    if (!canon->getTypedefNameForAnonDecl())
      renameTokenAt(canon, tl.getNameLoc());
    return true;
  }

  bool VisitTypedefNameDecl(TypedefNameDecl *d) {
    renameTokenAt(d->getCanonicalDecl(), d->getLocation());
    return true;
  }

  bool VisitTypedefTypeLoc(TypedefTypeLoc tl) {
    renameTokenAt(tl.getTypedefNameDecl(), tl.getNameLoc());
    return true;
  }
};

struct MiniASTConsumer : ASTConsumer {
ASTContext *ctx;
int n_fn = 0, n_var = 0, n_type = 0;

void Initialize(ASTContext &ctx) override { this->ctx = &ctx; }
static std::string getName(StringRef prefix, int &id) {
static const char digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
std::string newName;
for (;;) {
newName = std::string(1, prefix[id % prefix.size()]);
if (int i = id / prefix.size())
while (newName += digits[i % 62], i /= 62);
id++;
if (!used.contains(CachedHashStringRef(newName))) break;
}
return newName;
}
bool HandleTopLevelDecl(DeclGroupRef dgr) override {
for (auto s : {"j0", "j1", "jn", "j0f", "j1f", "jnf", "j0l", "j1l", "jnl"})
used.insert(CachedHashStringRef(s));
for (auto s : {"y0", "y1", "yn", "y0f", "y1f", "ynf", "y0l", "y1l", "ynl"})
used.insert(CachedHashStringRef(s));

Collector c(*ctx);
for (Decl *d : dgr)
c.TraverseDecl(d);
for (auto &[d, name] : d2name) {
if (name == "_f")
name = getName("abcdefghijklm", n_fn);
else if (name == "_v") {
int old_n_var = n_var;
auto newName = getName("nopqrstuvwxyz", n_var);
if (newName.size() < static_cast<VarDecl *>(d)->getName().size())
name = newName;
else {
name = static_cast<VarDecl *>(d)->getName();
n_var = old_n_var;
}
} else if (name == "_t")
name = getName("ABCDEFGHIJKLMNOPQRSTUVWXYZ", n_type);
}
return true;
}
void HandleTranslationUnit(ASTContext &ctx) override {
tooling::Replacements reps;
Renamer c(ctx, reps);
c.TraverseDecl(ctx.getTranslationUnitDecl());

auto &sm = ctx.getSourceManager();
StringRef code = sm.getBufferData(sm.getMainFileID());
auto res = tooling::applyAllReplacements(code, reps);
if (!res)
errx(2, "failed to apply replacements: %s", toString(res.takeError()).c_str());
newCode = *res;
}
};

/// Frontend action whose only job is to hand a fresh MiniASTConsumer to the
/// compiler instance; neither argument is used.
struct MiniAction : ASTFrontendAction {
  std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance & /*ci*/,
                                                 StringRef /*inFile*/) override {
    auto consumer = std::make_unique<MiniASTConsumer>();
    return consumer;
  }
};

/// Runs clangFormat over the whole of `newCode` with a style tuned for size
/// (starting from the "LLVM" style: very long lines, no indentation, no space
/// before assignment operators or parentheses) and stores the result back in
/// `newCode`. Exits with status 2 when the replacements cannot be applied.
void reformat() {
  format::FormatStyle style = cantFail(format::getStyle("LLVM", "-", "LLVM", newCode, nullptr));
  style.ColumnLimit = 9999;
  style.IndentWidth = 0;
  style.ContinuationIndentWidth = 0;
  style.SpaceBeforeAssignmentOperators = false;
  style.SpaceBeforeParens = format::FormatStyle::SBPO_Never;
  style.AlignEscapedNewlines = format::FormatStyle::ENAS_DontAlign;

  format::FormattingAttemptStatus status;
  // A single range covering the entire buffer.
  std::vector<tooling::Range> ranges{{0, unsigned(newCode.size())}};
  tooling::Replacements reps = format::reformat(style, newCode, ranges, "-", &status);
  auto res = tooling::applyAllReplacements(newCode, reps);
  if (!res)
    errx(2, "failed to apply replacements: %s", toString(res.takeError()).c_str());
  newCode = std::move(*res);
}
}

int main(int argc, char *argv[]) {
std::vector<const char *> args{argv[0], "-fsyntax-only"};
bool inplace = false;
const char *outfile = "/dev/stdout";
const char usage[] = R"(Usage: %s [-i] [-f fun]... a.c

Options:
-i edit a.c in place\n)";
for (int i = 1; i < argc; i++) {
StringRef opt(argv[i]);
if (opt[0] != '-')
args.push_back(argv[i]);
else if (opt == "-h") {
fputs(usage, stdout);
return 0;
} else if (opt == "-i")
inplace = true;
else if (opt == "-f" && i + 1 < argc)
ignores.push_back(argv[++i]);
else if (opt == "-o" && i + 1 < argc)
outfile = argv[++i];
else {
fputs(usage, stderr);
return 1;
}
}
ignores.push_back("main");

auto ci = buildCompilerInvocation(args);
if (!ci)
errx(1, "failed to build CompilerInvocation");

auto inst = std::make_unique<CompilerInstance>(std::make_shared<PCHContainerOperations>());
IgnoringDiagConsumer dc;
inst->setInvocation(std::move(ci));
inst->createDiagnostics(&dc, false);
inst->getDiagnostics().setIgnoreAllWarnings(true);
inst->setTarget(TargetInfo::CreateTargetInfo(inst->getDiagnostics(), inst->getInvocation().TargetOpts));
if (!inst->hasTarget())
errx(1, "hasTarget returns false");
inst->createFileManager(llvm::vfs::getRealFileSystem());
inst->setSourceManager(new SourceManager(inst->getDiagnostics(), inst->getFileManager(), true));

MiniAction action;
if (!action.BeginSourceFile(*inst, inst->getFrontendOpts().Inputs[0]))
errx(2, "failed to parse");
if (Error e = action.Execute())
errx(2, "failed to execute");
action.EndSourceFile();
reformat();

std::error_code ec;
raw_fd_ostream(inplace ? inst->getFrontendOpts().Inputs[0].getFile() : outfile, ec, sys::fs::OF_None) << newCode;
}

CMakeLists.txt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
cmake_minimum_required(VERSION 3.14)
project(cminify LANGUAGES C CXX)

add_executable(cminify "")

# Fall back to an optimized build when no build type was requested.
# Multi-configuration generators pick the configuration at build time, so the
# default is only applied when CMAKE_CONFIGURATION_TYPES is unset.
set(DEFAULT_CMAKE_BUILD_TYPE Release)
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  set(CMAKE_BUILD_TYPE ${DEFAULT_CMAKE_BUILD_TYPE} CACHE STRING "Build type" FORCE)
endif()

# The sources use C++17 (structured bindings, if-with-initializer).
set_property(TARGET cminify PROPERTY CXX_STANDARD 17)
set_property(TARGET cminify PROPERTY CXX_STANDARD_REQUIRED ON)
set_property(TARGET cminify PROPERTY CXX_EXTENSIONS OFF)

find_package(Clang REQUIRED)

# Link against the monolithic clang-cpp shared library when Clang was built
# that way, otherwise against the individual component libraries.
if(CLANG_LINK_CLANG_DYLIB)
  target_link_libraries(cminify PRIVATE clang-cpp)
else()
  target_link_libraries(cminify PRIVATE
    clangIndex
    clangFormat
    clangTooling
    clangToolingInclusions
    clangToolingCore
    clangFrontend
    clangParse
    clangSerialization
    clangSema
    clangAST
    clangLex
    clangDriver
    clangBasic
  )
endif()

# Same choice for LLVM: the single shared library or the needed components.
if(LLVM_LINK_LLVM_DYLIB)
  target_link_libraries(cminify PRIVATE LLVM)
else()
  target_link_libraries(cminify PRIVATE LLVMOption LLVMSupport)
endif()

if(NOT LLVM_ENABLE_RTTI)
  # releases.llvm.org libraries are compiled with -fno-rtti
  # The mismatch between lib{clang,LLVM}* and cminify can make libstdc++ std::make_shared return nullptr
  # _Sp_counted_ptr_inplace::_M_get_deleter
  if(MSVC)
    target_compile_options(cminify PRIVATE /GR-)
  else()
    target_compile_options(cminify PRIVATE -fno-rtti)
  endif()
endif()

target_sources(cminify PRIVATE main.cc)

foreach(include_dir ${LLVM_INCLUDE_DIRS} ${CLANG_INCLUDE_DIRS})
  get_filename_component(include_dir_realpath ${include_dir} REALPATH)
  # Don't add as SYSTEM if they are in CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES.
  # It would reorder the system search paths and cause issues with libstdc++'s
  # use of #include_next. See https://github.com/MaskRay/ccls/pull/417
  if(NOT "${include_dir_realpath}" IN_LIST CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES)
    target_include_directories(cminify SYSTEM PRIVATE ${include_dir})
  endif()
endforeach()

install(TARGETS cminify RUNTIME DESTINATION bin)

Define LLVM as the llvm-project repository and LLVMOUT as the build directory (make sure you have at least built these targets: ninja clang clangFormat clangIndex clangTooling).

1
2
cmake -GNinja -S. -Bout/release -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$LLVMOUT;$LLVMOUT/tools/clang;$LLVM/llvm;$LLVM/clang"
ninja -C out/release

If LLVM and Clang's CMake, library, and header files are installed in well-known locations, then -DCMAKE_PREFIX_PATH can be omitted.

It's certainly not straightforward to find all these APIs. I mainly use ccls as a reference, which was inspired by clangIndex. For writing this tool, I read a bit of the code of clang-rename, clang-format, and C-Reduce's clang_delta. C-Reduce provides clang_delta/RenameFun.cpp and two other passes (RenameVar, RenameParam) which do similar things. Its code is a bit old now, as it was written against a Clang from circa 2012.

Let's see an example. Unfortunately, I haven't found clangFormat options that remove the whitespace after = and ,. That can perhaps be done by a post-processing string substitution tool without introducing too much risk.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
% cat test/a.c
#include <stdint.h>
#include <string.h>

#pragma GCC diagnostic ignored "-Wpragmas"
static float foo(int, int);
static float foo(int aaa, int bbb) { aaa = 3; bbb = 5; return 1.0f; }

struct NimStrPayload;
typedef struct NimStrPayload NimStrPayload;
struct NimStringV2;
typedef struct NimStringV2 NimStringV2;

struct NimStrPayload { int64_t cap; char data[]; };
struct NimStringV2 { int64_t cap; NimStrPayload *p; };

#define XX NimStringV2

float goo() {
int u, v, w, x, y, z;
int s1, t1, u1, v1, w1, x1, y1, z1;
NimStringV2 s;
XX t;
return 1.0f;
}

int main() {
char a[10];
memset(a, 0, 10);
float _ = foo(3, 5) + goo();
}
% out/release/cminify test/a.c
#include <stdint.h>
#include <string.h>

#pragma GCC diagnostic ignored "-Wpragmas"
static float a(int, int);
static float a(int k, int l) {
k= 3;
l= 5;
return 1.0f;
}

struct C;
typedef struct C A;
struct D;
typedef struct D B;

struct C {
int64_t cap;
char data[];
};
struct D {
int64_t cap;
A *p;
};

#define XX B

float b() {
int u, v, w, x, y, z;
int m, n, o, p, q, r, y1, z1;
B s;
XX t;
return 1.0f;
}

int main() {
char a[10];
memset(a, 0, 10);
float _= a(3, 5) + b();
}

Layering check with Clang

Updated in 2023-07.

This article describes some Clang header modules features that apply to #include. These features enforce a more explicit dependency graph, which serves as documentation and makes refactoring convenient. The benefits of clean header inclusions are well described in Include What You Use as well, so I won't repeat them here.

When using C++20 modules, these features apply to #include in a global module fragment (module;) but have no effect for import declarations.

Layering check

-fmodules-decluse

For a #include directive, this option emits an error if the following conditions are satisfied (see clang/lib/Lex/ModuleMap.cpp diagnoseHeaderInclusion):

  • The main file is within a module (called "source module", say, A).
  • The main file or an included file from the source module includes a file from another module B.
  • A does not have a use-declaration of B (no use B).

For the first condition, -fmodule-map-file= is needed to load the source module map and -fmodule-name=A is needed to indicate that the source file is logically part of module A.

For the second condition, the module map defining B must be loaded by specifying -fimplicit-module-maps (implied by -fmodules and -fcxx-modules) or a -fmodule-map-file=.

Read More

zstd compressed debug sections

Updated in 2022-10.

In January I wrote Compressed debug sections. The venerable zlib shows its age and there are replacements which are better in every metric except adoption and a larger memory footprint. The obvious choice was Zstandard, but I was not so confident about adopting it and solving the ecosystem issue. At any rate, I slowly removed some legacy .zdebug support from llvm-project so that a new format could be more easily introduced.

Read More

-march=, -mcpu=, and -mtune=

In GCC and Clang, there are three major options specifying the architecture and microarchitecture the generated code can run on. The general semantics are described below, but each target machine may assign different semantics.

  • -march=X: (execution domain) Generate code that can use instructions available in the architecture X
  • -mtune=X: (optimization domain) Optimize for the microarchitecture X, but does not change the ABI or make assumptions about available instructions
  • -mcpu=X: Specify both -march= and -mtune= but can be overridden by the two options. The supported values are generally the same as -mtune=. The architecture name is inferred from X

Read More

glibc and DT_GNU_HASH

tl;dr The incompatibility of "Easy Anti-Cheat" with the shared objects provided by glibc 2.36 (libc.so.6, ld-linux-x86-64.so.2) is an instance of Hyrum's law.

I feel compelled to demystify the incident and hope that people will stop defaming glibc.

Read More

Everything I know about glibc

Updated in 2023-08.

glibc is an implementation of the user-space side of standard C/POSIX functions with Linux extensions.

Read More

C standard library headers in C++

Updated in 2024-01.

In ISO C++ standards, [support.c.headers.general] says:

Source files that are not intended to also be valid ISO C should not use any of the C headers.

Then, [depr.c.headers] describes how a C header name.h is transformed to the corresponding C++ cname header. There is a helpful example:

[ Example: The header <cstdlib> assuredly provides its declarations and definitions within the namespace std. It may also provide these names within the global namespace. The header <stdlib.h> assuredly provides the same declarations and definitions within the global namespace, much as in the C Standard. It may also provide these names within the namespace std. — end example ]

"may also" in the wording allows implementations to provide mix-and-match, e.g. #include <stdlib.h> may provide std::exit and #include <cstdlib> may provide ::exit.

libstdc++ chooses to enable global namespace declarations with C++ cname header. For example, #include <cstdlib> also includes the corresponding C header stdlib.h and we get declarations in both the global namespace and the namespace std.

1
2
. /usr/include/c++/12/cstdlib
.. /usr/include/stdlib.h

The preprocessed output looks like:

1
2
3
4
5
6
7
8
9
extern void exit (int __status) noexcept (true) __attribute__ ((__noreturn__));

extern "C++"
{
namespace std __attribute__ ((__visibility__ ("default")))
{
using ::exit;
}
}

The compiler knows that the declarations in the namespace std are identical to the ones in the global namespace. The compiler recognizes some library functions and can optimize them. Thanks to the using-declarations, the compiler can also optimize some C library functions when they are called through the namespace std (e.g. many std::mem* and std::str* functions).

For some C standard library headers, libstdc++ provides wrappers (libstdc++-v3/include/c_compatibility/) which take precedence over the glibc headers. The configuration of libstdc++ uses --enable-cheaders=c_global by default. if GLIBCXX_C_HEADERS_C_GLOBAL in libstdc++-v3/include/Makefile.am describes that the 6 wrappers (complex.h, fenv.h, tgmath.h, math.h, stdatomic.h, stdlib.h) shadow the C library headers of the same name. For example, #include <stdlib.h> includes the wrapper stdlib.h which includes cstdlib, therefore bringing exit into the namespace std.

1
2
3
. /usr/include/c++/12/stdlib.h
.. /usr/include/c++/12/cstdlib
... /usr/include/stdlib.h

Read More