[PATCH] Fix race in DWARF reader

Wed Oct 18 23:51:23 GMT 2023

The recent change to record the DWARF language in the per-CU data
yielded a race warning in my testing:

ThreadSanitizer: data race ../../binutils-gdb/gdb/dwarf2/read.c:21779 in prepare_one_comp_unit

This patch fixes the bug by applying the same style of fix that was
done for the ordinary (gdb) language.

I wonder if this code could be improved.  Requiring an atomic for the
language in particular seems unfortunate, as it is often consulted
during index finalization.  However, I haven't investigated this.

Regression tested on x86-64 Fedora 38.
---
 gdb/dwarf2/index-write.c |  2 +-
 gdb/dwarf2/read.c        | 23 ++++++++++++++++++++--
 gdb/dwarf2/read.h        | 41 +++++++++++++++++++---------------------
 3 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/gdb/dwarf2/index-write.c b/gdb/dwarf2/index-write.c
index bac4a6c6934..8c87f053da6 100644
--- a/gdb/dwarf2/index-write.c
+++ b/gdb/dwarf2/index-write.c
@@ -1205,7 +1205,7 @@ write_shortcuts_table (cooked_index *table, data_buf &shortcuts,
 
   if (main_info != nullptr)
     {
-      dw_lang = main_info->per_cu->dw_lang;
+      dw_lang = main_info->per_cu->dw_lang ();
 
       if (dw_lang != 0)
 	{
diff --git a/gdb/dwarf2/read.c b/gdb/dwarf2/read.c
index c85eaac3035..36c1d9f4cd6 100644
--- a/gdb/dwarf2/read.c
+++ b/gdb/dwarf2/read.c
@@ -21615,6 +21615,26 @@ dwarf2_per_cu_data::ref_addr_size () const
     return header->offset_size;
 }
 
+/* See read.h.  */
+
+void
+dwarf2_per_cu_data::set_lang (enum language lang,
+			      dwarf_source_language dw_lang)
+{
+  if (unit_type () == DW_UT_partial)
+    return;
+
+  /* Set if not set already.  */
+  packed<language, LANGUAGE_BYTES> new_value = lang;
+  packed<language, LANGUAGE_BYTES> old_value = m_lang.exchange (new_value);
+  /* If already set, verify that it's the same value.  */
+  gdb_assert (old_value == language_unknown || old_value == lang);
+
+  packed<dwarf_source_language, 2> new_dw = dw_lang;
+  packed<dwarf_source_language, 2> old_dw = m_dw_lang.exchange (new_dw);
+  gdb_assert (old_dw == 0 || old_dw == dw_lang);
+}
+
 /* A helper function for dwarf2_find_containing_comp_unit that returns
    the index of the result, and that searches a vector.  It will
    return a result even if the offset in question does not actually
@@ -21776,7 +21796,6 @@ prepare_one_comp_unit (struct dwarf2_cu *cu, struct die_info *comp_unit_die,
   else
     lang = pretend_language;
 
-  cu->per_cu->dw_lang = dw_lang;
   cu->language_defn = language_def (lang);
 
   switch (comp_unit_die->tag)
@@ -21796,7 +21815,7 @@ prepare_one_comp_unit (struct dwarf2_cu *cu, struct die_info *comp_unit_die,
 	     sect_offset_str (cu->per_cu->sect_off));
     }
 
-  cu->per_cu->set_lang (lang);
+  cu->per_cu->set_lang (lang, dw_lang);
 }
 
 /* See read.h.  */
diff --git a/gdb/dwarf2/read.h b/gdb/dwarf2/read.h
index c92474d8b9d..5d038d3c620 100644
--- a/gdb/dwarf2/read.h
+++ b/gdb/dwarf2/read.h
@@ -183,6 +183,15 @@ struct dwarf2_per_cu_data
   /* The language of this CU.  */
   std::atomic<packed<language, LANGUAGE_BYTES>> m_lang {language_unknown};
 
+  /* The original DW_LANG_* value of the CU, as provided to us by
+     DW_AT_language. It is interesting to keep this value around in cases where
+     we can't use the values from the language enum, as the mapping to them is
+     lossy, and, while that is usually fine, things like the index have an
+     understandable bias towards not exposing internal GDB structures to the
+     outside world, and so prefer to use DWARF constants in their stead. */
+  std::atomic<packed<dwarf_source_language, 2>> m_dw_lang
+       { (dwarf_source_language) 0 };
+
 public:
   /* True if this CU has been scanned by the indexer; false if
      not.  */
@@ -245,14 +254,6 @@ struct dwarf2_per_cu_data
      functions above.  */
   std::vector <dwarf2_per_cu_data *> *imported_symtabs = nullptr;
 
-  /* The original DW_LANG_* value of the CU, as provided to us by
-     DW_AT_language. It is interesting to keep this value around in cases where
-     we can't use the values from the language enum, as the mapping to them is
-     lossy, and, while that is usually fine, things like the index have an
-     understandable bias towards not exposing internal GDB structures to the
-     outside world, and so prefer to use DWARF constants in their stead. */
-  dwarf_source_language dw_lang;
-
   /* Return true of IMPORTED_SYMTABS is empty or not yet allocated.  */
   bool imported_symtabs_empty () const
   {
@@ -365,20 +366,16 @@ struct dwarf2_per_cu_data
     return l;
   }
 
-  void set_lang (enum language lang)
-  {
-    if (unit_type () == DW_UT_partial)
-      return;
-    /* Set if not set already.  */
-    packed<language, LANGUAGE_BYTES> nope = language_unknown;
-    if (m_lang.compare_exchange_strong (nope, lang))
-      return;
-    /* If already set, verify that it's the same value.  */
-    nope = lang;
-    if (m_lang.compare_exchange_strong (nope, lang))
-      return;
-    gdb_assert_not_reached ();
-  }
+  /* Return the language of this CU, as a DWARF DW_LANG_* value.  This
+     may be 0 in some situations.  */
+  dwarf_source_language dw_lang () const
+  { return m_dw_lang; }
+
+  /* Set the language of this CU.  LANG is the language in gdb terms,
+     and DW_LANG is the language as a DW_LANG_* value.  These may
+     differ, as DW_LANG can be 0 for included units, whereas in this
+     situation LANG would be set by the importing CU.  */
+  void set_lang (enum language lang, dwarf_source_language dw_lang);
 
   /* Free any cached file names.  */
   void free_cached_file_names ();
-- 
2.41.0